@@ -58,11 +58,13 @@
 #include <linux/sched/signal.h>
 #include <trace/events/block.h>
+#include <linux/list_sort.h>
 #include "md.h"
 #include "raid5.h"
 #include "raid0.h"
 #include "bitmap.h"
+#include "raid5-log.h"
 #define UNSUPPORTED_MDDEV_FLAGS	(1L << MD_FAILFAST_SUPPORTED)
@@ -156,17 +158,6 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
 	return slot;
 }
-static void return_io(struct bio_list *return_bi)
-{
-	struct bio *bi;
-	while ((bi = bio_list_pop(return_bi)) != NULL) {
-		bi->bi_iter.bi_size = 0;
-		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
-					 bi, 0);
-		bio_endio(bi);
-	}
-}
-
 static void print_raid5_conf (struct r5conf *conf);
 static int stripe_operations_active(struct stripe_head *sh)
@@ -176,6 +167,13 @@ static int stripe_operations_active(struct stripe_head *sh)
 		test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 }
+static bool stripe_is_lowprio(struct stripe_head *sh)
+{
+	return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
+		test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
+	       !test_bit(STRIPE_R5C_CACHING, &sh->state);
+}
+
 static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
 {
 	struct r5conf *conf = sh->raid_conf;
@@ -191,7 +189,10 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
 	if (list_empty(&sh->lru)) {
 		struct r5worker_group *group;
 		group = conf->worker_groups + cpu_to_group(cpu);
-		list_add_tail(&sh->lru, &group->handle_list);
+		if (stripe_is_lowprio(sh))
+			list_add_tail(&sh->lru, &group->loprio_list);
+		else
+			list_add_tail(&sh->lru, &group->handle_list);
 		group->stripes_cnt++;
 		sh->group = group;
 	}
@@ -254,7 +255,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 			clear_bit(STRIPE_DELAYED, &sh->state);
 			clear_bit(STRIPE_BIT_DELAY, &sh->state);
 			if (conf->worker_cnt_per_group == 0) {
-			list_add_tail(&sh->lru, &conf->handle_list);
+			if (stripe_is_lowprio(sh))
+				list_add_tail(&sh->lru,
+						&conf->loprio_list);
+			else
+				list_add_tail(&sh->lru,
+						&conf->handle_list);
 			} else {
 				raid5_wakeup_stripe_thread(sh);
 				return;
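
These two hunks introduce the scheduling split: stripes whose only remaining work is flushing write-back cache content (full or partial r5c stripes that are past the caching phase) count as low priority and go on a separate loprio_list, both per worker group and on the global conf lists. A minimal user-space sketch of the same enqueue decision (the list helpers and names here are invented for illustration, not part of the patch):

    /* Two-queue priority enqueue, mirroring do_release_stripe() above. */
    #include <stdbool.h>

    struct list_head { struct list_head *next, *prev; };

    static void list_add_tail(struct list_head *n, struct list_head *h)
    {
    	n->prev = h->prev;
    	n->next = h;
    	h->prev->next = n;
    	h->prev = n;
    }

    struct sched_lists {
    	struct list_head handle_list;	/* urgent stripe work */
    	struct list_head loprio_list;	/* background cache flushes */
    };

    static void enqueue(struct sched_lists *q, struct list_head *lru,
    		    bool lowprio)
    {
    	list_add_tail(lru, lowprio ? &q->loprio_list : &q->handle_list);
    }
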
@@ -481,6 +487,7 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
 		sh->dev[i].page = page;
 		sh->dev[i].orig_page = page;
 	}
+
 	return 0;
 }
@@ -729,7 +736,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
 {
 	struct r5conf *conf = sh->raid_conf;
-	if (conf->log)
+	if (conf->log || raid5_has_ppl(conf))
 		return false;
 	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
 		!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
@@ -863,41 +870,107 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
 	return 1;
 }
-static void flush_deferred_bios(struct r5conf *conf)
+static void dispatch_bio_list(struct bio_list *tmp)
 {
-	struct bio_list tmp;
 	struct bio *bio;
-	if (!conf->batch_bio_dispatch || !conf->group_cnt)
+	while ((bio = bio_list_pop(tmp)))
+		generic_make_request(bio);
+}
+
+static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
+{
+	const struct r5pending_data *da = list_entry(a,
+				struct r5pending_data, sibling);
+	const struct r5pending_data *db = list_entry(b,
+				struct r5pending_data, sibling);
+	if (da->sector > db->sector)
+		return 1;
+	if (da->sector < db->sector)
+		return -1;
+	return 0;
+}
+
+static void dispatch_defer_bios(struct r5conf *conf, int target,
+				struct bio_list *list)
+{
+	struct r5pending_data *data;
+	struct list_head *first, *next = NULL;
+	int cnt = 0;
+
+	if (conf->pending_data_cnt == 0)
+		return;
+
+	list_sort(NULL, &conf->pending_list, cmp_stripe);
+
+	first = conf->pending_list.next;
+
+	/* temporarily move the head */
+	if (conf->next_pending_data)
+		list_move_tail(&conf->pending_list,
+				&conf->next_pending_data->sibling);
+
+	while (!list_empty(&conf->pending_list)) {
+		data = list_first_entry(&conf->pending_list,
+			struct r5pending_data, sibling);
+		if (&data->sibling == first)
+			first = data->sibling.next;
+		next = data->sibling.next;
+
+		bio_list_merge(list, &data->bios);
+		list_move(&data->sibling, &conf->free_list);
+		cnt++;
+		if (cnt >= target)
+			break;
+	}
+	conf->pending_data_cnt -= cnt;
+	BUG_ON(conf->pending_data_cnt < 0 || cnt < target);
+
+	if (next != &conf->pending_list)
+		conf->next_pending_data = list_entry(next,
+				struct r5pending_data, sibling);
+	else
+		conf->next_pending_data = NULL;
+	/* list isn't empty */
+	if (first != &conf->pending_list)
+		list_move_tail(&conf->pending_list, first);
+}
+
+static void flush_deferred_bios(struct r5conf *conf)
+{
+	struct bio_list tmp = BIO_EMPTY_LIST;
+
+	if (conf->pending_data_cnt == 0)
 		return;
-	bio_list_init(&tmp);
 	spin_lock(&conf->pending_bios_lock);
-	bio_list_merge(&tmp, &conf->pending_bios);
-	bio_list_init(&conf->pending_bios);
+	dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
+	BUG_ON(conf->pending_data_cnt != 0);
 	spin_unlock(&conf->pending_bios_lock);
-	while ((bio = bio_list_pop(&tmp)))
-		generic_make_request(bio);
+	dispatch_bio_list(&tmp);
 }
-static void defer_bio_issue(struct r5conf *conf, struct bio *bio)
+static void defer_issue_bios(struct r5conf *conf, sector_t sector,
+			     struct bio_list *bios)
 {
-	/*
-	 * change group_cnt will drain all bios, so this is safe
-	 *
-	 * A read generally means a read-modify-write, which usually means a
-	 * randwrite, so we don't delay it
-	 */
-	if (!conf->batch_bio_dispatch || !conf->group_cnt ||
-	    bio_op(bio) == REQ_OP_READ) {
-		generic_make_request(bio);
-		return;
-	}
+	struct bio_list tmp = BIO_EMPTY_LIST;
+	struct r5pending_data *ent;
+
 	spin_lock(&conf->pending_bios_lock);
-	bio_list_add(&conf->pending_bios, bio);
+	ent = list_first_entry(&conf->free_list, struct r5pending_data,
+				sibling);
+	list_move_tail(&ent->sibling, &conf->pending_list);
+	ent->sector = sector;
+	bio_list_init(&ent->bios);
+	bio_list_merge(&ent->bios, bios);
+	conf->pending_data_cnt++;
+	if (conf->pending_data_cnt >= PENDING_IO_MAX)
+		dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
+
 	spin_unlock(&conf->pending_bios_lock);
-	md_wakeup_thread(conf->mddev->thread);
+
+	dispatch_bio_list(&tmp);
 }
 static void
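
The hunk above replaces the old "pop everything and submit" flush with a sorted, batched dispatcher: writes deferred per stripe are kept in r5pending_data entries, and dispatch_defer_bios() runs list_sort() over pending_list keyed on the stripe sector, so the member disks see largely sequential I/O even when stripes were released out of order. (PENDING_IO_MAX and PENDING_IO_ONE_FLUSH are defined alongside struct r5pending_data in the header changes of this series, not shown here.) The comparator only has to return the usual negative/zero/positive three-way result. A freestanding sketch of the same idea, with qsort() and an array standing in for list_sort() and the linked list:

    /* Accumulate, sort by sector, then "dispatch" in ascending order. */
    #include <stdio.h>
    #include <stdlib.h>

    struct pending { unsigned long long sector; int batch_id; };

    static int cmp_pending(const void *a, const void *b)
    {
    	const struct pending *da = a, *db = b;

    	if (da->sector > db->sector)
    		return 1;
    	if (da->sector < db->sector)
    		return -1;
    	return 0;	/* same contract as cmp_stripe() above */
    }

    int main(void)
    {
    	struct pending p[] = { { 4096, 0 }, { 0, 1 }, { 2048, 2 } };
    	size_t i, n = sizeof(p) / sizeof(p[0]);

    	qsort(p, n, sizeof(p[0]), cmp_pending);
    	for (i = 0; i < n; i++)
    		printf("issue batch %d at sector %llu\n",
    		       p[i].batch_id, p[i].sector);
    	return 0;
    }
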
@@ -910,21 +983,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 	struct r5conf *conf = sh->raid_conf;
 	int i, disks = sh->disks;
 	struct stripe_head *head_sh = sh;
+	struct bio_list pending_bios = BIO_EMPTY_LIST;
+	bool should_defer;
 	might_sleep();
-	if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
-		/* writing out phase */
-		if (s->waiting_extra_page)
-			return;
-		if (r5l_write_stripe(conf->log, sh) == 0)
-			return;
-	} else { /* caching phase */
-		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
-			r5c_cache_data(conf->log, sh, s);
-			return;
-		}
-	}
+	if (log_stripe(sh, s) == 0)
+		return;
+
+	should_defer = conf->batch_bio_dispatch && conf->group_cnt;
 	for (i = disks; i--; ) {
 		int op, op_flags = 0;
@@ -1080,7 +1147,10 @@ again:
 				trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
 						      bi, disk_devt(conf->mddev->gendisk),
 						      sh->dev[i].sector);
-			defer_bio_issue(conf, bi);
+			if (should_defer && op_is_write(op))
+				bio_list_add(&pending_bios, bi);
+			else
+				generic_make_request(bi);
 		}
 		if (rrdev) {
 			if (s->syncing || s->expanding || s->expanded
@@ -1125,7 +1195,10 @@ again:
 				trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
 						      rbi, disk_devt(conf->mddev->gendisk),
 						      sh->dev[i].sector);
-			defer_bio_issue(conf, rbi);
+			if (should_defer && op_is_write(op))
+				bio_list_add(&pending_bios, rbi);
+			else
+				generic_make_request(rbi);
 		}
 		if (!rdev && !rrdev) {
 			if (op_is_write(op))
@@ -1143,6 +1216,9 @@ again:
 		if (sh != head_sh)
 			goto again;
 	}
+
+	if (should_defer && !bio_list_empty(&pending_bios))
+		defer_issue_bios(conf, head_sh->sector, &pending_bios);
 }
 static struct dma_async_tx_descriptor *
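
ops_run_io() now evaluates the deferral condition once per stripe (should_defer) instead of per bio, collects eligible writes on a stack-local bio_list, and publishes the whole batch with a single defer_issue_bios() call after the device loop, so pending_bios_lock is taken once per stripe rather than once per member write; reads still go straight to generic_make_request(). A sketch of the gather-locally, publish-once pattern, with a pthread mutex and toy types in place of the kernel primitives:

    #include <pthread.h>
    #include <stddef.h>

    struct item { struct item *next; };
    struct flist { struct item *head, *tail; };

    static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct flist pending;

    static void flist_add(struct flist *l, struct item *it)
    {
    	it->next = NULL;
    	if (l->tail)
    		l->tail->next = it;
    	else
    		l->head = it;
    	l->tail = it;
    }

    /* One lock round-trip publishes everything gathered lock-free. */
    static void publish(struct flist *local)
    {
    	if (!local->head)
    		return;
    	pthread_mutex_lock(&pending_lock);
    	if (pending.tail)
    		pending.tail->next = local->head;
    	else
    		pending.head = local->head;
    	pending.tail = local->tail;
    	pthread_mutex_unlock(&pending_lock);
    }
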
@@ -1212,7 +1288,6 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
 static void ops_complete_biofill(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
-	struct bio_list return_bi = BIO_EMPTY_LIST;
 	int i;
 	pr_debug("%s: stripe %llu\n", __func__,
@@ -1236,16 +1311,13 @@ static void ops_complete_biofill(void *stripe_head_ref)
 			while (rbi && rbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				rbi2 = r5_next_bio(rbi, dev->sector);
-				if (!raid5_dec_bi_active_stripes(rbi))
-					bio_list_add(&return_bi, rbi);
+				bio_endio(rbi);
 				rbi = rbi2;
 			}
 		}
 	}
 	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
-	return_io(&return_bi);
-
 	set_bit(STRIPE_HANDLE, &sh->state);
 	raid5_release_stripe(sh);
 }
@@ -2014,6 +2086,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 			tx = ops_run_prexor6(sh, percpu, tx);
 	}
+	if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
+		tx = ops_run_partial_parity(sh, percpu, tx);
+
 	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
 		tx = ops_run_biodrain(sh, tx);
 		overlap_clear++;
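
The new STRIPE_OP_PARTIAL_PARITY step is the PPL hook in the operations pipeline: before the data is drained to the stripe, ops_run_partial_parity() (provided by the PPL code elsewhere in this series) stores into ppl_page the XOR of the data blocks that the write does not touch. After a crash, XORing that saved value with the logged data reproduces valid parity. A worked miniature with one-byte "blocks", under those assumptions:

    /* 3 data disks + parity; the write replaces d1 only. */
    #include <assert.h>

    int main(void)
    {
    	unsigned char d0 = 0x11, d1 = 0x22, d2 = 0x44, d1_new = 0x99;
    	unsigned char pp = d0 ^ d2;		/* partial parity: untouched blocks */
    	unsigned char p_new = pp ^ d1_new;	/* parity after recovery */

    	assert(p_new == (d0 ^ d1_new ^ d2));	/* equals full recomputation */
    	return 0;
    }
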
@@ -2046,8 +2121,15 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 	put_cpu();
 }
+static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
+{
+	if (sh->ppl_page)
+		__free_page(sh->ppl_page);
+	kmem_cache_free(sc, sh);
+}
+
 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
-	int disks)
+	int disks, struct r5conf *conf)
 {
 	struct stripe_head *sh;
 	int i;
@@ -2061,6 +2143,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
 		INIT_LIST_HEAD(&sh->r5c);
 		INIT_LIST_HEAD(&sh->log_list);
 		atomic_set(&sh->count, 1);
+		sh->raid_conf = conf;
 		sh->log_start = MaxSector;
 		for (i = 0; i < disks; i++) {
 			struct r5dev *dev = &sh->dev[i];
@@ -2068,6 +2151,14 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
 			bio_init(&dev->req, &dev->vec, 1);
 			bio_init(&dev->rreq, &dev->rvec, 1);
 		}
+
+		if (raid5_has_ppl(conf)) {
+			sh->ppl_page = alloc_page(gfp);
+			if (!sh->ppl_page) {
+				free_stripe(sc, sh);
+				sh = NULL;
+			}
+		}
 	}
 	return sh;
 }
@@ -2075,15 +2166,13 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
 {
 	struct stripe_head *sh;
-	sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size);
+	sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
 	if (!sh)
 		return 0;
-	sh->raid_conf = conf;
-
 	if (grow_buffers(sh, gfp)) {
 		shrink_buffers(sh);
-		kmem_cache_free(conf->slab_cache, sh);
+		free_stripe(conf->slab_cache, sh);
 		return 0;
 	}
 	sh->hash_lock_index =
@@ -2210,7 +2299,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	 *    pages have been transferred over, and the old kmem_cache is
 	 *    freed when all stripes are done.
 	 * 3/ reallocate conf->disks to be suitable bigger. If this fails,
-	 *    we simple return a failre status - no need to clean anything up.
+	 *    we simple return a failure status - no need to clean anything up.
 	 * 4/ allocate new pages for the new slots in the new stripe_heads.
 	 *    If this fails, we don't bother trying the shrink the
 	 *    stripe_heads down again, we just leave them as they are.
@@ -2228,9 +2317,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	int i;
 	int hash, cnt;
-	if (newsize <= conf->pool_size)
-		return 0; /* never bother to shrink */
-
 	err = md_allow_write(conf->mddev);
 	if (err)
 		return err;
@@ -2246,11 +2332,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	mutex_lock(&conf->cache_size_mutex);
 	for (i = conf->max_nr_stripes; i; i--) {
-		nsh = alloc_stripe(sc, GFP_KERNEL, newsize);
+		nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
 		if (!nsh)
 			break;
-		nsh->raid_conf = conf;
 		list_add(&nsh->lru, &newstripes);
 	}
 	if (i) {
@@ -2258,7 +2343,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 		while (!list_empty(&newstripes)) {
 			nsh = list_entry(newstripes.next, struct stripe_head, lru);
 			list_del(&nsh->lru);
-			kmem_cache_free(sc, nsh);
+			free_stripe(sc, nsh);
 		}
 		kmem_cache_destroy(sc);
 		mutex_unlock(&conf->cache_size_mutex);
@@ -2284,7 +2369,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 			nsh->dev[i].orig_page = osh->dev[i].page;
 		}
 		nsh->hash_lock_index = hash;
-		kmem_cache_free(conf->slab_cache, osh);
+		free_stripe(conf->slab_cache, osh);
 		cnt++;
 		if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
 		    !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
@@ -2323,6 +2408,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 			err = -ENOMEM;
 	mutex_unlock(&conf->cache_size_mutex);
+
+	conf->slab_cache = sc;
+	conf->active_name = 1-conf->active_name;
+
 	/* Step 4, return new stripes to service */
 	while(!list_empty(&newstripes)) {
 		nsh = list_entry(newstripes.next, struct stripe_head, lru);
@@ -2340,8 +2429,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	}
 	/* critical section pass, GFP_NOIO no longer needed */
-	conf->slab_cache = sc;
-	conf->active_name = 1-conf->active_name;
 	if (!err)
 		conf->pool_size = newsize;
 	return err;
@@ -2359,7 +2446,7 @@ static int drop_one_stripe(struct r5conf *conf)
 		return 0;
 	BUG_ON(atomic_read(&sh->count));
 	shrink_buffers(sh);
-	kmem_cache_free(conf->slab_cache, sh);
+	free_stripe(conf->slab_cache, sh);
 	atomic_dec(&conf->active_stripes);
 	conf->max_nr_stripes--;
 	return 1;
@@ -3082,6 +3169,12 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 		s->locked++;
 	}
+	if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
+	    test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
+	    !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
+	    test_bit(R5_Insync, &sh->dev[pd_idx].flags))
+		set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
+
 	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
 		__func__, (unsigned long long)sh->sector,
 		s->locked, s->ops_request);
@@ -3103,14 +3196,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 		(unsigned long long)bi->bi_iter.bi_sector,
 		(unsigned long long)sh->sector);
-	/*
-	 * If several bio share a stripe. The bio bi_phys_segments acts as a
-	 * reference count to avoid race. The reference count should already be
-	 * increased before this function is called (for example, in
-	 * raid5_make_request()), so other bio sharing this stripe will not free the
-	 * stripe. If a stripe is owned by one stripe, the stripe lock will
-	 * protect it.
-	 */
 	spin_lock_irq(&sh->stripe_lock);
 	/* Don't allow new IO added to stripes in batch list */
 	if (sh->batch_head)
@@ -3129,6 +3214,36 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 	if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
 		goto overlap;
+	if (forwrite && raid5_has_ppl(conf)) {
+		/*
+		 * With PPL only writes to consecutive data chunks within a
+		 * stripe are allowed because for a single stripe_head we can
+		 * only have one PPL entry at a time, which describes one data
+		 * range. Not really an overlap, but wait_for_overlap can be
+		 * used to handle this.
+		 */
+		sector_t sector;
+		sector_t first = 0;
+		sector_t last = 0;
+		int count = 0;
+		int i;
+
+		for (i = 0; i < sh->disks; i++) {
+			if (i != sh->pd_idx &&
+			    (i == dd_idx || sh->dev[i].towrite)) {
+				sector = sh->dev[i].sector;
+				if (count == 0 || sector < first)
+					first = sector;
+				if (sector > last)
+					last = sector;
+				count++;
+			}
+		}
+
+		if (first + conf->chunk_sectors * (count - 1) != last)
+			goto overlap;
+	}
+
 	if (!forwrite || previous)
 		clear_bit(STRIPE_BATCH_READY, &sh->state);
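
The consecutiveness test works because the sh->dev[i].sector values of data chunks in one stripe are spaced exactly chunk_sectors apart in array LBA terms: count chunks spanning [first, last] are gap-free precisely when last - first == chunk_sectors * (count - 1). The same arithmetic, checked standalone:

    #include <assert.h>

    typedef unsigned long long sector_t;

    static int chunks_consecutive(const sector_t *secs, int count,
    			      sector_t chunk_sectors)
    {
    	sector_t first = secs[0], last = secs[0];
    	int i;

    	for (i = 1; i < count; i++) {
    		if (secs[i] < first)
    			first = secs[i];
    		if (secs[i] > last)
    			last = secs[i];
    	}
    	return first + chunk_sectors * (count - 1) == last;
    }

    int main(void)
    {
    	sector_t ok[] = { 0, 128, 256 };	/* chunk_sectors = 128 */
    	sector_t gap[] = { 0, 256 };		/* hole at 128: reject */

    	assert(chunks_consecutive(ok, 3, 128));
    	assert(!chunks_consecutive(gap, 2, 128));
    	return 0;
    }
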
@@ -3136,7 +3251,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 	if (*bip)
 		bi->bi_next = *bip;
 	*bip = bi;
-	raid5_inc_bi_active_stripes(bi);
+	bio_inc_remaining(bi);
+	md_write_inc(conf->mddev, bi);
 	if (forwrite) {
 		/* check if page is covered */
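
This pair of lines is the heart of the series: rather than overloading bi_phys_segments as a 16-bit active-stripe refcount, every stripe that takes a piece of the bio bumps the block layer's own completion counter with bio_inc_remaining() and later just calls bio_endio() on its share; the parent bio completes when the last share drops. md_write_inc() keeps md's pending-write accounting in step (see the md_write_start()/md_write_end() changes further down). A minimal model of the remaining-counter behaviour:

    #include <assert.h>

    struct fake_bio {
    	int remaining;	/* stands in for bio->__bi_remaining */
    	int done;
    };

    static void fake_inc_remaining(struct fake_bio *b) { b->remaining++; }

    static void fake_endio(struct fake_bio *b)
    {
    	if (--b->remaining == 0)
    		b->done = 1;	/* the real bi_end_io would run here */
    }

    int main(void)
    {
    	struct fake_bio b = { .remaining = 1 };	/* submitter's reference */

    	fake_inc_remaining(&b);	/* stripe A takes a share */
    	fake_inc_remaining(&b);	/* stripe B takes a share */
    	fake_endio(&b);		/* stripe A completes */
    	fake_endio(&b);		/* stripe B completes */
    	assert(!b.done);	/* still held by the submitter */
    	fake_endio(&b);		/* submit path's final bio_endio() */
    	assert(b.done);
    	return 0;
    }
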
@@ -3213,8 +3329,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
 static void
 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
-				struct stripe_head_state *s, int disks,
-				struct bio_list *return_bi)
+		     struct stripe_head_state *s, int disks)
 {
 	int i;
 	BUG_ON(sh->batch_head);
@@ -3250,7 +3365,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		if (bi)
 			bitmap_end = 1;
-		r5l_stripe_write_finished(sh);
+		log_stripe_write_finished(sh);
 		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
 			wake_up(&conf->wait_for_overlap);
@@ -3260,10 +3375,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
 			bi->bi_error = -EIO;
-			if (!raid5_dec_bi_active_stripes(bi)) {
-				md_write_end(conf->mddev);
-				bio_list_add(return_bi, bi);
-			}
+			md_write_end(conf->mddev);
+			bio_endio(bi);
 			bi = nextbi;
 		}
 		if (bitmap_end)
@@ -3284,10 +3397,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
 			bi->bi_error = -EIO;
-			if (!raid5_dec_bi_active_stripes(bi)) {
-				md_write_end(conf->mddev);
-				bio_list_add(return_bi, bi);
-			}
+			md_write_end(conf->mddev);
+			bio_endio(bi);
 			bi = bi2;
 		}
@@ -3312,8 +3423,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 					r5_next_bio(bi, sh->dev[i].sector);
 				bi->bi_error = -EIO;
-				if (!raid5_dec_bi_active_stripes(bi))
-					bio_list_add(return_bi, bi);
+				bio_endio(bi);
 				bi = nextbi;
 			}
 		}
@@ -3449,7 +3559,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
 	    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 		/* Pre-reads at not permitted until after short delay
 		 * to gather multiple requests.  However if this
-		 * device is no Insync, the block could only be be computed
+		 * device is no Insync, the block could only be computed
 		 * and there is no need to delay that.
 		 */
 		return 0;
@@ -3468,7 +3578,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
 	/* If we are forced to do a reconstruct-write, either because
 	 * the current RAID6 implementation only supports that, or
-	 * or because parity cannot be trusted and we are currently
+	 * because parity cannot be trusted and we are currently
 	 * recovering it, there is extra need to be careful.
 	 * If one of the devices that we would need to read, because
 	 * it is not being overwritten (and maybe not written at all)
@@ -3508,9 +3618,20 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
 		BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
 		BUG_ON(test_bit(R5_Wantread, &dev->flags));
 		BUG_ON(sh->batch_head);
+
+	/*
+	 * In the raid6 case if the only non-uptodate disk is P
+	 * then we already trusted P to compute the other failed
+	 * drives. It is safe to compute rather than re-read P.
+	 * In other cases we only compute blocks from failed
+	 * devices, otherwise check/repair might fail to detect
+	 * a real inconsistency.
+	 */
+
 		if ((s->uptodate == disks - 1) &&
+		    ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
 		    (s->failed && (disk_idx == s->failed_num[0] ||
-		     disk_idx == s->failed_num[1]))) {
+		     disk_idx == s->failed_num[1])))) {
 			/* have disk failed, and we're requested to fetch it;
 			 * do compute it
 			 */
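
The widened condition lets RAID6 regenerate P by computation whenever every other block is already up to date: recovering the failed data blocks via Q implicitly trusted the same arithmetic, so re-reading P adds nothing. In any other case, computing instead of reading could hide a genuine parity mismatch from check/repair. For reference, P is the plain byte-wise XOR of the data blocks:

    #include <assert.h>
    #include <stddef.h>

    static void compute_p(const unsigned char **data, int ndisks,
    		      size_t len, unsigned char *p)
    {
    	size_t i;
    	int d;

    	for (i = 0; i < len; i++) {
    		unsigned char x = 0;

    		for (d = 0; d < ndisks; d++)
    			x ^= data[d][i];
    		p[i] = x;
    	}
    }

    int main(void)
    {
    	const unsigned char d0[2] = { 0x0f, 0xaa }, d1[2] = { 0xf0, 0x55 };
    	const unsigned char *data[2] = { d0, d1 };
    	unsigned char p[2];

    	compute_p(data, 2, sizeof(p), p);
    	assert(p[0] == 0xff && p[1] == 0xff);
    	return 0;
    }
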
@@ -3612,7 +3733,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
  * never LOCKED, so we don't need to test 'failed' directly.
  */
 static void handle_stripe_clean_event(struct r5conf *conf,
-	struct stripe_head *sh, int disks, struct bio_list *return_bi)
+	struct stripe_head *sh, int disks)
 {
 	int i;
 	struct r5dev *dev;
@@ -3644,10 +3765,8 @@ returnbi:
 			while (wbi && wbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				wbi2 = r5_next_bio(wbi, dev->sector);
-				if (!raid5_dec_bi_active_stripes(wbi)) {
-					md_write_end(conf->mddev);
-					bio_list_add(return_bi, wbi);
-				}
+				md_write_end(conf->mddev);
+				bio_endio(wbi);
 				wbi = wbi2;
 			}
 			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
@@ -3669,7 +3788,7 @@ returnbi:
 				discard_pending = 1;
 	}
-	r5l_stripe_write_finished(sh);
+	log_stripe_write_finished(sh);
 	if (!discard_pending &&
 	    test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
@@ -4556,7 +4675,8 @@ static void handle_stripe(struct stripe_head *sh)
 		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
 			goto finish;
-	if (s.handle_bad_blocks) {
+	if (s.handle_bad_blocks ||
+	    test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
 		set_bit(STRIPE_HANDLE, &sh->state);
 		goto finish;
 	}
@@ -4589,7 +4709,7 @@ static void handle_stripe(struct stripe_head *sh)
 		sh->reconstruct_state = 0;
 		break_stripe_batch_list(sh, 0);
 		if (s.to_read+s.to_write+s.written)
-			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
+			handle_failed_stripe(conf, sh, &s, disks);
 		if (s.syncing + s.replacing)
 			handle_failed_sync(conf, sh, &s);
 	}
@@ -4655,11 +4775,11 @@ static void handle_stripe(struct stripe_head *sh)
 			     && !test_bit(R5_LOCKED, &qdev->flags)
 			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
 				 test_bit(R5_Discard, &qdev->flags))))))
-		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+		handle_stripe_clean_event(conf, sh, disks);
 	if (s.just_cached)
-		r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
-	r5l_stripe_write_finished(sh);
+		r5c_handle_cached_data_endio(conf, sh, disks);
+	log_stripe_write_finished(sh);
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
@@ -4886,16 +5006,6 @@ finish:
 			md_wakeup_thread(conf->mddev->thread);
 	}
-	if (!bio_list_empty(&s.return_bi)) {
-		if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
-			spin_lock_irq(&conf->device_lock);
-			bio_list_merge(&conf->return_bi, &s.return_bi);
-			spin_unlock_irq(&conf->device_lock);
-			md_wakeup_thread(conf->mddev->thread);
-		} else
-			return_io(&s.return_bi);
-	}
-
 	clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
 }
@@ -4984,12 +5094,14 @@ static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
 	md_wakeup_thread(conf->mddev->thread);
 }
-static struct bio *remove_bio_from_retry(struct r5conf *conf)
+static struct bio *remove_bio_from_retry(struct r5conf *conf,
+					 unsigned int *offset)
 {
 	struct bio *bi;
 	bi = conf->retry_read_aligned;
 	if (bi) {
+		*offset = conf->retry_read_offset;
 		conf->retry_read_aligned = NULL;
 		return bi;
 	}
@@ -4997,11 +5109,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
 	if(bi) {
 		conf->retry_read_aligned_list = bi->bi_next;
 		bi->bi_next = NULL;
-		/*
-		 * this sets the active strip count to 1 and the processed
-		 * strip count to zero (upper 8 bits)
-		 */
-		raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
+		*offset = 0;
 	}
 	return bi;
@@ -5136,24 +5244,20 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
 {
 	struct bio *split;
+	sector_t sector = raid_bio->bi_iter.bi_sector;
+	unsigned chunk_sects = mddev->chunk_sectors;
+	unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
-	do {
-		sector_t sector = raid_bio->bi_iter.bi_sector;
-		unsigned chunk_sects = mddev->chunk_sectors;
-		unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
-
-		if (sectors < bio_sectors(raid_bio)) {
-			split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set);
-			bio_chain(split, raid_bio);
-		} else
-			split = raid_bio;
+	if (sectors < bio_sectors(raid_bio)) {
+		struct r5conf *conf = mddev->private;
+		split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split);
+		bio_chain(split, raid_bio);
+		generic_make_request(raid_bio);
+		raid_bio = split;
+	}
-		if (!raid5_read_one_chunk(mddev, split)) {
-			if (split != raid_bio)
-				generic_make_request(raid_bio);
-			return split;
-		}
-	} while (split != raid_bio);
+	if (!raid5_read_one_chunk(mddev, raid_bio))
+		return raid_bio;
 	return NULL;
 }
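
chunk_aligned_read() loses its loop: at most one split happens now. The head of the bio, up to the next chunk boundary, is cut off with bio_split() from the array's private bio set, the tail is chained and resubmitted through generic_make_request() for ordinary processing, and only the aligned head is tried on the fast path. sectors is the distance to the boundary, using the power-of-two mask trick; checked standalone:

    #include <assert.h>

    typedef unsigned long long sector_t;

    /* Requires chunk_sects to be a power of two, as raid4/5/6 enforce. */
    static unsigned sectors_to_chunk_end(sector_t sector, unsigned chunk_sects)
    {
    	return chunk_sects - (sector & (chunk_sects - 1));
    }

    int main(void)
    {
    	assert(sectors_to_chunk_end(0, 128) == 128);	/* aligned start */
    	assert(sectors_to_chunk_end(100, 128) == 28);
    	assert(sectors_to_chunk_end(129, 128) == 127);
    	return 0;
    }
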
@@ -5170,19 +5274,27 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
  */
 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 {
-	struct stripe_head *sh = NULL, *tmp;
+	struct stripe_head *sh, *tmp;
 	struct list_head *handle_list = NULL;
-	struct r5worker_group *wg = NULL;
+	struct r5worker_group *wg;
+	bool second_try = !r5c_is_writeback(conf->log);
+	bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state);
+again:
+	wg = NULL;
+	sh = NULL;
 	if (conf->worker_cnt_per_group == 0) {
-		handle_list = &conf->handle_list;
+		handle_list = try_loprio ? &conf->loprio_list :
+				&conf->handle_list;
 	} else if (group != ANY_GROUP) {
-		handle_list = &conf->worker_groups[group].handle_list;
+		handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
+				&conf->worker_groups[group].handle_list;
 		wg = &conf->worker_groups[group];
 	} else {
 		int i;
 		for (i = 0; i < conf->group_cnt; i++) {
-			handle_list = &conf->worker_groups[i].handle_list;
+			handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
+				&conf->worker_groups[i].handle_list;
 			wg = &conf->worker_groups[i];
 			if (!list_empty(handle_list))
 				break;
@@ -5233,8 +5345,13 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 		wg = NULL;
 	}
-	if (!sh)
-		return NULL;
+	if (!sh) {
+		if (second_try)
+			return NULL;
+		second_try = true;
+		try_loprio = !try_loprio;
+		goto again;
+	}
 	if (wg) {
 		wg->stripes_cnt--;
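
On the consumer side the selection becomes a two-pass affair: the first scan prefers loprio_list when the write-back log is tight (R5C_LOG_TIGHT) and handle_list otherwise, and an empty result toggles try_loprio for exactly one retry, so neither class of stripe can starve the other. With the log in write-through mode second_try starts out true and the fallback pass is skipped entirely. A compact model of the retry shape:

    /* Prefer one queue, fall back to the other exactly once. */
    static int pick(const int *pref, int npref, const int *other, int nother)
    {
    	int second_try = 0;

    again:
    	if (npref > 0)
    		return pref[0];
    	if (second_try)
    		return -1;	/* both queues empty */
    	second_try = 1;
    	{	/* swap the queues, as the try_loprio toggle does */
    		const int *tq = pref;
    		int tn = npref;

    		pref = other; other = tq;
    		npref = nother; nother = tn;
    	}
    	goto again;
    }
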
@@ -5323,7 +5440,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 	struct r5conf *conf = mddev->private;
 	sector_t logical_sector, last_sector;
 	struct stripe_head *sh;
-	int remaining;
 	int stripe_sectors;
 	if (mddev->reshape_position != MaxSector)
@@ -5334,7 +5450,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 	last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
 	bi->bi_next = NULL;
-	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
+	md_write_start(mddev, bi);
 	stripe_sectors = conf->chunk_sectors *
 		(conf->raid_disks - conf->max_degraded);
@@ -5380,7 +5496,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 				continue;
 			sh->dev[d].towrite = bi;
 			set_bit(R5_OVERWRITE, &sh->dev[d].flags);
-			raid5_inc_bi_active_stripes(bi);
+			bio_inc_remaining(bi);
+			md_write_inc(mddev, bi);
 			sh->overwrite_disks++;
 		}
 		spin_unlock_irq(&sh->stripe_lock);
@@ -5403,11 +5520,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 		release_stripe_plug(mddev, sh);
 	}
-	remaining = raid5_dec_bi_active_stripes(bi);
-	if (remaining == 0) {
-		md_write_end(mddev);
-		bio_endio(bi);
-	}
+	md_write_end(mddev);
+	bio_endio(bi);
 }
 static void raid5_make_request(struct mddev *mddev, struct bio * bi)
@@ -5418,7 +5532,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	sector_t logical_sector, last_sector;
 	struct stripe_head *sh;
 	const int rw = bio_data_dir(bi);
-	int remaining;
 	DEFINE_WAIT(w);
 	bool do_prepare;
 	bool do_flush = false;
@@ -5440,8 +5553,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 		do_flush = bi->bi_opf & REQ_PREFLUSH;
 	}
-	md_write_start(mddev, bi);
-
 	/*
 	 * If array is degraded, better not do chunk aligned read because
 	 * later we might have to read it again in order to reconstruct
@@ -5462,7 +5573,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 	last_sector = bio_end_sector(bi);
 	bi->bi_next = NULL;
-	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
+	md_write_start(mddev, bi);
 	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
@@ -5597,16 +5708,9 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	}
 	finish_wait(&conf->wait_for_overlap, &w);
-	remaining = raid5_dec_bi_active_stripes(bi);
-	if (remaining == 0) {
-
-		if ( rw == WRITE )
-			md_write_end(mddev);
-
-		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
-					 bi, 0);
-		bio_endio(bi);
-	}
+	if (rw == WRITE)
+		md_write_end(mddev);
+	bio_endio(bi);
 }
 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
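
With bi_phys_segments no longer abused, write accounting becomes explicit and symmetric: md_write_start() opens the account when the bio enters the personality, md_write_inc() (added to md.c earlier in this series) takes one reference per stripe the bio is attached to, each per-stripe completion pairs with md_write_end(), and the submit path drops its own reference unconditionally at the end. The count must return to zero exactly when the last piece finishes; modelled minimally:

    #include <assert.h>

    static int writes_pending;

    static void write_start(void) { writes_pending++; }	/* submit path */
    static void write_inc(void)   { writes_pending++; }	/* per stripe  */
    static void write_end(void)   { assert(writes_pending-- > 0); }

    int main(void)
    {
    	write_start();	/* raid5_make_request() */
    	write_inc();	/* add_stripe_bio(), stripe A */
    	write_inc();	/* add_stripe_bio(), stripe B */
    	write_end();	/* stripe A completes */
    	write_end();	/* stripe B completes */
    	write_end();	/* final md_write_end() in the submit path */
    	assert(writes_pending == 0);
    	return 0;
    }
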
@@ -5955,7 +6059,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
 	return STRIPE_SECTORS;
 }
-static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
+static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
+			      unsigned int offset)
 {
 	/* We may not be able to submit a whole bio at once as there
 	 * may not be enough stripe_heads available.
@@ -5971,7 +6076,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 	int dd_idx;
 	sector_t sector, logical_sector, last_sector;
 	int scnt = 0;
-	int remaining;
 	int handled = 0;
 	logical_sector = raid_bio->bi_iter.bi_sector &
@@ -5985,7 +6089,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 	     sector += STRIPE_SECTORS,
 		     scnt++) {
-		if (scnt < raid5_bi_processed_stripes(raid_bio))
+		if (scnt < offset)
 			/* already done this stripe */
 			continue;
@@ -5993,15 +6097,15 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 		if (!sh) {
 			/* failed to get a stripe - must wait */
-			raid5_set_bi_processed_stripes(raid_bio, scnt);
 			conf->retry_read_aligned = raid_bio;
+			conf->retry_read_offset = scnt;
 			return handled;
 		}
 		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
 			raid5_release_stripe(sh);
-			raid5_set_bi_processed_stripes(raid_bio, scnt);
 			conf->retry_read_aligned = raid_bio;
+			conf->retry_read_offset = scnt;
 			return handled;
 		}
@@ -6010,12 +6114,9 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 		raid5_release_stripe(sh);
 		handled++;
 	}
-	remaining = raid5_dec_bi_active_stripes(raid_bio);
-	if (remaining == 0) {
-		trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
-					 raid_bio, 0);
-		bio_endio(raid_bio);
-	}
+
+	bio_endio(raid_bio);
+
 	if (atomic_dec_and_test(&conf->active_aligned_reads))
 		wake_up(&conf->wait_for_quiescent);
 	return handled;
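
Retry bookkeeping for aligned reads moves out of the bio as well: instead of squirrelling a processed-stripe count into bi_phys_segments bits, the resume point is stored as a plain offset (conf->retry_read_offset) next to the saved bio, and the loop skips the first offset stripes on re-entry. The pattern in isolation:

    /* Resumable iteration: remember progress outside the work item. */
    struct retry_state { int offset; };

    static int process_all(int nitems, struct retry_state *st,
    		       int (*try_one)(int))
    {
    	int i;

    	for (i = 0; i < nitems; i++) {
    		if (i < st->offset)
    			continue;	/* finished on an earlier pass */
    		if (!try_one(i)) {
    			st->offset = i;	/* resume here next time */
    			return 0;
    		}
    	}
    	st->offset = 0;
    	return 1;
    }
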
@@ -6058,7 +6159,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
 	for (i = 0; i < batch_size; i++)
 		handle_stripe(batch[i]);
-	r5l_write_stripe_run(conf->log);
+	log_write_stripe_run(conf);
 	cond_resched();
@@ -6075,6 +6176,7 @@ static void raid5_do_work(struct work_struct *work)
 	struct r5worker *worker = container_of(work, struct r5worker, work);
 	struct r5worker_group *group = worker->group;
 	struct r5conf *conf = group->conf;
+	struct mddev *mddev = conf->mddev;
 	int group_id = group - conf->worker_groups;
 	int handled;
 	struct blk_plug plug;
@@ -6095,6 +6197,9 @@ static void raid5_do_work(struct work_struct *work)
 		if (!batch_size && !released)
 			break;
 		handled += batch_size;
+		wait_event_lock_irq(mddev->sb_wait,
+			!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
+			conf->device_lock);
 	}
 	pr_debug("%d stripes handled\n", handled);
@@ -6122,24 +6227,13 @@ static void raid5d(struct md_thread *thread)
 	md_check_recovery(mddev);
-	if (!bio_list_empty(&conf->return_bi) &&
-	    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
-		struct bio_list tmp = BIO_EMPTY_LIST;
-		spin_lock_irq(&conf->device_lock);
-		if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
-			bio_list_merge(&tmp, &conf->return_bi);
-			bio_list_init(&conf->return_bi);
-		}
-		spin_unlock_irq(&conf->device_lock);
-		return_io(&tmp);
-	}
-
 	blk_start_plug(&plug);
 	handled = 0;
 	spin_lock_irq(&conf->device_lock);
 	while (1) {
 		struct bio *bio;
 		int batch_size, released;
+		unsigned int offset;
 		released = release_stripe_list(conf, conf->temp_inactive_list);
 		if (released)
@@ -6157,10 +6251,10 @@ static void raid5d(struct md_thread *thread)
 		}
 		raid5_activate_delayed(conf);
-		while ((bio = remove_bio_from_retry(conf))) {
+		while ((bio = remove_bio_from_retry(conf, &offset))) {
 			int ok;
 			spin_unlock_irq(&conf->device_lock);
-			ok = retry_aligned_read(conf, bio);
+			ok = retry_aligned_read(conf, bio, offset);
 			spin_lock_irq(&conf->device_lock);
 			if (!ok)
 				break;
@@ -6544,6 +6638,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt,
 		group = &(*worker_groups)[i];
 		INIT_LIST_HEAD(&group->handle_list);
+		INIT_LIST_HEAD(&group->loprio_list);
 		group->conf = conf;
 		group->workers = workers + i * cnt;
@@ -6634,8 +6729,8 @@ static void free_conf(struct r5conf *conf)
 {
 	int i;
-	if (conf->log)
-		r5l_exit_log(conf->log);
+	log_exit(conf);
+
 	if (conf->shrinker.nr_deferred)
 		unregister_shrinker(&conf->shrinker);
@@ -6646,7 +6741,10 @@ static void free_conf(struct r5conf *conf)
 		if (conf->disks[i].extra_page)
 			put_page(conf->disks[i].extra_page);
 	kfree(conf->disks);
+	if (conf->bio_split)
+		bioset_free(conf->bio_split);
 	kfree(conf->stripe_hashtbl);
+	kfree(conf->pending_data);
 	kfree(conf);
 }
@@ -6756,6 +6854,14 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
 	if (conf == NULL)
 		goto abort;
+	INIT_LIST_HEAD(&conf->free_list);
+	INIT_LIST_HEAD(&conf->pending_list);
+	conf->pending_data = kzalloc(sizeof(struct r5pending_data) *
+		PENDING_IO_MAX, GFP_KERNEL);
+	if (!conf->pending_data)
+		goto abort;
+	for (i = 0; i < PENDING_IO_MAX; i++)
+		list_add(&conf->pending_data[i].sibling, &conf->free_list);
 	/* Don't enable multi-threading by default*/
 	if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
 				 &new_group)) {
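
setup_conf() preallocates all PENDING_IO_MAX r5pending_data entries with one kzalloc() and threads them onto free_list, so defer_issue_bios() can grab an entry with list_first_entry() and never allocates in the I/O path; the matching kfree(conf->pending_data) sits in free_conf() above. The same fixed-size free-list pool, sketched in user space:

    #include <stdlib.h>

    #define POOL_MAX 256	/* stands in for PENDING_IO_MAX */

    struct node { struct node *next; /* payload would live here */ };

    static struct node *free_list;
    static struct node *pool;

    static int pool_init(void)
    {
    	int i;

    	pool = calloc(POOL_MAX, sizeof(*pool));
    	if (!pool)
    		return -1;
    	for (i = 0; i < POOL_MAX; i++) {
    		pool[i].next = free_list;	/* thread onto the free list */
    		free_list = &pool[i];
    	}
    	return 0;
    }

    static struct node *pool_get(void)	/* caller bounds usage to POOL_MAX */
    {
    	struct node *n = free_list;

    	free_list = n->next;
    	return n;
    }

    static void pool_put(struct node *n)
    {
    	n->next = free_list;
    	free_list = n;
    }
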
@@ -6771,15 +6877,14 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
 	INIT_LIST_HEAD(&conf->handle_list);
+	INIT_LIST_HEAD(&conf->loprio_list);
 	INIT_LIST_HEAD(&conf->hold_list);
 	INIT_LIST_HEAD(&conf->delayed_list);
 	INIT_LIST_HEAD(&conf->bitmap_list);
-	bio_list_init(&conf->return_bi);
 	init_llist_head(&conf->released_stripes);
 	atomic_set(&conf->active_stripes, 0);
 	atomic_set(&conf->preread_active_stripes, 0);
 	atomic_set(&conf->active_aligned_reads, 0);
-	bio_list_init(&conf->pending_bios);
 	spin_lock_init(&conf->pending_bios_lock);
 	conf->batch_bio_dispatch = true;
 	rdev_for_each(rdev, mddev) {
@@ -6813,6 +6918,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 		goto abort;
 	}
+	conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+	if (!conf->bio_split)
+		goto abort;
 	conf->mddev = mddev;
 	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
@@ -7097,6 +7205,13 @@ static int raid5_run(struct mddev *mddev)
 		BUG_ON(mddev->delta_disks != 0);
 	}
+	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
+	    test_bit(MD_HAS_PPL, &mddev->flags)) {
+		pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
+			mdname(mddev));
+		clear_bit(MD_HAS_PPL, &mddev->flags);
+	}
+
 	if (mddev->private == NULL)
 		conf = setup_conf(mddev);
 	else
@@ -7188,7 +7303,10 @@ static int raid5_run(struct mddev *mddev)
 	if (mddev->degraded > dirty_parity_disks &&
 	    mddev->recovery_cp != MaxSector) {
-		if (mddev->ok_start_degraded)
+		if (test_bit(MD_HAS_PPL, &mddev->flags))
+			pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
+				mdname(mddev));
+		else if (mddev->ok_start_degraded)
 			pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
 				mdname(mddev));
 		else {
@@ -7254,14 +7372,6 @@ static int raid5_run(struct mddev *mddev)
 		mddev->queue->limits.discard_alignment = stripe;
 		mddev->queue->limits.discard_granularity = stripe;
-		/*
-		 * We use 16-bit counter of active stripes in bi_phys_segments
-		 * (minus one for over-loaded initialization)
-		 */
-		blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS);
-		blk_queue_max_discard_sectors(mddev->queue,
-					      0xfffe * STRIPE_SECTORS);
-
 		blk_queue_max_write_same_sectors(mddev->queue, 0);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
@@ -7299,14 +7409,8 @@ static int raid5_run(struct mddev *mddev)
 		blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
 	}
-	if (journal_dev) {
-		char b[BDEVNAME_SIZE];
-
-		pr_debug("md/raid:%s: using device %s as journal\n",
-			 mdname(mddev), bdevname(journal_dev->bdev, b));
-		if (r5l_init_log(conf, journal_dev))
-			goto abort;
-	}
+	if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
+		goto abort;
 	return 0;
 abort:
@@ -7420,17 +7524,16 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	print_raid5_conf(conf);
 	if (test_bit(Journal, &rdev->flags) && conf->log) {
-		struct r5l_log *log;
 		/*
 		 * we can't wait pending write here, as this is called in
 		 * raid5d, wait will deadlock.
+		 * neilb: there is no locking about new writes here,
+		 * so this cannot be safe.
 		 */
-		if (atomic_read(&mddev->writes_pending))
+		if (atomic_read(&conf->active_stripes)) {
 			return -EBUSY;
-		log = conf->log;
-		conf->log = NULL;
-		synchronize_rcu();
-		r5l_exit_log(log);
+		}
+		log_exit(conf);
 		return 0;
 	}
 	if (rdev == p->rdev)
@@ -7469,6 +7572,11 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 			*rdevp = rdev;
 		}
 	}
+	if (!err) {
+		err = log_modify(conf, rdev, false);
+		if (err)
+			goto abort;
+	}
 	if (p->replacement) {
 		/* We must have just cleared 'rdev' */
 		p->rdev = p->replacement;
@@ -7477,12 +7585,12 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * but will never see neither - if they are careful
 		 */
 		p->replacement = NULL;
-		clear_bit(WantReplacement, &rdev->flags);
-	} else
-		/* We might have just removed the Replacement as faulty-
-		 * clear the bit just in case
-		 */
-		clear_bit(WantReplacement, &rdev->flags);
+
+		if (!err)
+			err = log_modify(conf, p->rdev, true);
+	}
+
+	clear_bit(WantReplacement, &rdev->flags);
 abort:
 	print_raid5_conf(conf);
@@ -7499,7 +7607,6 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	int last = conf->raid_disks - 1;
 	if (test_bit(Journal, &rdev->flags)) {
-		char b[BDEVNAME_SIZE];
 		if (conf->log)
 			return -EBUSY;
@@ -7508,9 +7615,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * The array is in readonly mode if journal is missing, so no
 		 * write requests running. We should be safe
 		 */
-		r5l_init_log(conf, rdev);
-		pr_debug("md/raid:%s: using device %s as journal\n",
-			 mdname(mddev), bdevname(rdev->bdev, b));
+		log_init(conf, rdev, false);
 		return 0;
 	}
 	if (mddev->recovery_disabled == conf->recovery_disabled)
@@ -7537,10 +7642,12 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		if (p->rdev == NULL) {
 			clear_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = disk;
-			err = 0;
 			if (rdev->saved_raid_disk != disk)
 				conf->fullsync = 1;
 			rcu_assign_pointer(p->rdev, rdev);
+
+			err = log_modify(conf, rdev, true);
+
 			goto out;
 		}
 	}
@@ -7574,7 +7681,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
 	sector_t newsize;
 	struct r5conf *conf = mddev->private;
-	if (conf->log)
+	if (conf->log || raid5_has_ppl(conf))
 		return -EINVAL;
 	sectors &= ~((sector_t)conf->chunk_sectors - 1);
 	newsize = raid5_size(mddev, sectors, mddev->raid_disks);
@@ -7625,7 +7732,7 @@ static int check_reshape(struct mddev *mddev)
 {
 	struct r5conf *conf = mddev->private;
-	if (conf->log)
+	if (conf->log || raid5_has_ppl(conf))
 		return -EINVAL;
 	if (mddev->delta_disks == 0 &&
 	    mddev->new_layout == mddev->layout &&
@@ -7658,6 +7765,9 @@ static int check_reshape(struct mddev *mddev)
 				      mddev->chunk_sectors)
 		    ) < 0)
 			return -ENOMEM;
+
+	if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
+		return 0; /* never bother to shrink */
 	return resize_stripes(conf, (conf->previous_raid_disks
 				     + mddev->delta_disks));
 }
@@ -8148,6 +8258,68 @@ static void *raid6_takeover(struct mddev *mddev)
 	return setup_conf(mddev);
 }
+static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
+{
+	struct r5conf *conf;
+	int err;
+
+	err = mddev_lock(mddev);
+	if (err)
+		return err;
+	conf = mddev->private;
+	if (!conf) {
+		mddev_unlock(mddev);
+		return -ENODEV;
+	}
+
+	if (strncmp(buf, "ppl", 3) == 0) {
+		/* ppl only works with RAID 5 */
+		if (!raid5_has_ppl(conf) && conf->level == 5) {
+			err = log_init(conf, NULL, true);
+			if (!err) {
+				err = resize_stripes(conf, conf->pool_size);
+				if (err)
+					log_exit(conf);
+			}
+		} else
+			err = -EINVAL;
+	} else if (strncmp(buf, "resync", 6) == 0) {
+		if (raid5_has_ppl(conf)) {
+			mddev_suspend(mddev);
+			log_exit(conf);
+			mddev_resume(mddev);
+			err = resize_stripes(conf, conf->pool_size);
+		} else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
+			   r5l_log_disk_error(conf)) {
+			bool journal_dev_exists = false;
+			struct md_rdev *rdev;
+
+			rdev_for_each(rdev, mddev)
+				if (test_bit(Journal, &rdev->flags)) {
+					journal_dev_exists = true;
+					break;
+				}
+
+			if (!journal_dev_exists) {
+				mddev_suspend(mddev);
+				clear_bit(MD_HAS_JOURNAL, &mddev->flags);
+				mddev_resume(mddev);
+			} else /* need remove journal device first */
+				err = -EBUSY;
+		} else
+			err = -EINVAL;
+	} else {
+		err = -EINVAL;
+	}
+
+	if (!err)
+		md_update_sb(mddev, 1);
+
+	mddev_unlock(mddev);
+
+	return err;
+}
+
+
|
|
|
static struct md_personality raid6_personality =
|
|
|
{
|
|
|
.name = "raid6",
|
|
@@ -8170,6 +8342,7 @@ static struct md_personality raid6_personality =
|
|
|
.quiesce = raid5_quiesce,
|
|
|
.takeover = raid6_takeover,
|
|
|
.congested = raid5_congested,
|
|
|
+ .change_consistency_policy = raid5_change_consistency_policy,
|
|
|
};
|
|
|
static struct md_personality raid5_personality =
|
|
|
{
|
|
@@ -8193,6 +8366,7 @@ static struct md_personality raid5_personality =
|
|
|
.quiesce = raid5_quiesce,
|
|
|
.takeover = raid5_takeover,
|
|
|
.congested = raid5_congested,
|
|
|
+ .change_consistency_policy = raid5_change_consistency_policy,
|
|
|
};
|
|
|
|
|
|
static struct md_personality raid4_personality =
|
|
@@ -8217,6 +8391,7 @@ static struct md_personality raid4_personality =
|
|
|
.quiesce = raid5_quiesce,
|
|
|
.takeover = raid4_takeover,
|
|
|
.congested = raid5_congested,
|
|
|
+ .change_consistency_policy = raid5_change_consistency_policy,
|
|
|
};
|
|
|
|
|
|
static int __init raid5_init(void)
|