@@ -136,10 +136,13 @@ static void r10bio_pool_free(void *r10_bio, void *data)
 	kfree(r10_bio);
 }
 
+#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
 /* amount of memory to reserve for resync requests */
 #define RESYNC_WINDOW (1024*1024)
 /* maximum number of concurrent requests, memory permitting */
 #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
+#define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW)
+#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
 
 /*
  * When performing a resync, we need to read and compare, so
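For reference, here is a small user-space sketch of the arithmetic behind the new macros, assuming the 64KB RESYNC_BLOCK_SIZE defined earlier in raid10.c; it is an illustration, not part of the patch.

/* Sketch only: verifies the sector conversions done by the macros above. */
#include <stdio.h>

#define RESYNC_BLOCK_SIZE (64 * 1024)			/* as in raid10.c */
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)		/* 128 sectors */
#define RESYNC_WINDOW (1024 * 1024)
#define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW)	/* 32MB */
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)

int main(void)
{
	printf("RESYNC_SECTORS = %d\n", RESYNC_SECTORS);	/* 128 */
	printf("CLUSTER_RESYNC_WINDOW_SECTORS = %d\n",
	       CLUSTER_RESYNC_WINDOW_SECTORS);			/* 65536 */
	return 0;
}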
@@ -2840,6 +2843,43 @@ static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
 	return r10bio;
 }
 
+/*
+ * Set cluster_sync_high since we need other nodes to add the
+ * range [cluster_sync_low, cluster_sync_high] to suspend list.
+ */
+static void raid10_set_cluster_sync_high(struct r10conf *conf)
+{
+	sector_t window_size;
+	int extra_chunk, chunks;
+
+	/*
+	 * First, we define a "stripe" as a unit that spans all
+	 * member devices once, so the number of chunks is
+	 * raid_disks / near_copies. Otherwise, if near_copies is
+	 * close to raid_disks, the resync window could grow
+	 * linearly with raid_disks, which means we would suspend
+	 * a really large IO window while it is not necessary.
+	 * If raid_disks is not divisible by near_copies, an
+	 * extra chunk is needed to ensure the whole "stripe"
+	 * is covered.
+	 */
+
+	chunks = conf->geo.raid_disks / conf->geo.near_copies;
+	if (conf->geo.raid_disks % conf->geo.near_copies == 0)
+		extra_chunk = 0;
+	else
+		extra_chunk = 1;
+	window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors;
+
+	/*
+	 * At least use a 32M window to align with raid1's resync window
+	 */
+	window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ?
+			CLUSTER_RESYNC_WINDOW_SECTORS : window_size;
+
+	conf->cluster_sync_high = conf->cluster_sync_low + window_size;
+}
+
 /*
  * perform a "sync" on one "block"
  *
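To make the window computation above concrete, here is a hedged user-space sketch of what raid10_set_cluster_sync_high() computes; the geometry (raid_disks = 5, near_copies = 2, 512-sector chunks) is hypothetical and stands in for struct r10conf.

/* Sketch only: mirrors the window arithmetic with made-up geometry values. */
#include <stdio.h>

#define CLUSTER_RESYNC_WINDOW_SECTORS ((32 * 1024 * 1024) >> 9)	/* 65536 */

int main(void)
{
	unsigned int raid_disks = 5, near_copies = 2;	/* hypothetical */
	unsigned long long chunk_sectors = 512;		/* 256KB chunks */
	unsigned long long cluster_sync_low = 0;
	unsigned long long window_size;
	int chunks, extra_chunk;

	/* One "stripe" spans all members once: raid_disks / near_copies
	 * chunks, plus one more chunk if the division leaves a remainder. */
	chunks = raid_disks / near_copies;			/* 2 */
	extra_chunk = (raid_disks % near_copies) ? 1 : 0;	/* 1 */
	window_size = (chunks + extra_chunk) * chunk_sectors;	/* 1536 sectors */

	/* Never drop below the 32MB floor used to align with raid1. */
	if (window_size < CLUSTER_RESYNC_WINDOW_SECTORS)
		window_size = CLUSTER_RESYNC_WINDOW_SECTORS;	/* 65536 */

	printf("cluster_sync_high = %llu\n", cluster_sync_low + window_size);
	return 0;
}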
@@ -2912,6 +2952,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		max_sector = mddev->resync_max_sectors;
 	if (sector_nr >= max_sector) {
+		conf->cluster_sync_low = 0;
+		conf->cluster_sync_high = 0;
+
 		/* If we aborted, we need to abort the
 		 * sync on the 'current' bitmap chucks (there can
 		 * be several when recovering multiple devices).
@@ -3266,7 +3309,17 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		/* resync. Schedule a read for every block at this virt offset */
 		int count = 0;
 
-		bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0);
+		/*
+		 * curr_resync_completed may not be updated in time, and
+		 * cluster_sync_low will be set based on it. So check
+		 * against "sector_nr + 2 * RESYNC_SECTORS" to be safe;
+		 * this ensures curr_resync_completed gets updated in
+		 * bitmap_cond_end_sync.
+		 */
+		bitmap_cond_end_sync(mddev->bitmap, sector_nr,
+				     mddev_is_clustered(mddev) &&
+				     (sector_nr + 2 * RESYNC_SECTORS >
+				      conf->cluster_sync_high));
 
 		if (!bitmap_start_sync(mddev->bitmap, sector_nr,
 				       &sync_blocks, mddev->degraded) &&
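The third argument now passed to bitmap_cond_end_sync() becomes true once the resync cursor is within two resync blocks of cluster_sync_high, forcing curr_resync_completed to be refreshed before cluster_sync_low is derived from it. A small sketch of that condition, with made-up numbers:

/* Sketch only: shows when the "force close" argument flips to true. */
#include <stdbool.h>
#include <stdio.h>

#define RESYNC_SECTORS ((64 * 1024) >> 9)	/* 128 sectors */

int main(void)
{
	unsigned long long cluster_sync_high = 65536;	/* hypothetical */
	bool clustered = true;
	unsigned long long sector_nr;

	for (sector_nr = 65024; sector_nr <= 65536; sector_nr += 128) {
		bool force = clustered &&
			     (sector_nr + 2 * RESYNC_SECTORS > cluster_sync_high);
		printf("sector_nr=%llu force=%d\n", sector_nr, force);
	}
	return 0;
}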
@@ -3400,6 +3453,52 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 	} while (++page_idx < RESYNC_PAGES);
 	r10_bio->sectors = nr_sectors;
 
+	if (mddev_is_clustered(mddev) &&
+	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+		/* It is resync not recovery */
+		if (conf->cluster_sync_high < sector_nr + nr_sectors) {
+			conf->cluster_sync_low = mddev->curr_resync_completed;
+			raid10_set_cluster_sync_high(conf);
+			/* Send resync message */
+			md_cluster_ops->resync_info_update(mddev,
+						conf->cluster_sync_low,
+						conf->cluster_sync_high);
+		}
+	} else if (mddev_is_clustered(mddev)) {
+		/* This is recovery not resync */
+		sector_t sect_va1, sect_va2;
+		bool broadcast_msg = false;
+
+		for (i = 0; i < conf->geo.raid_disks; i++) {
+			/*
+			 * sector_nr is a device address for recovery, so we
+			 * need to translate it to an array address before
+			 * comparing with cluster_sync_high.
+			 */
+			sect_va1 = raid10_find_virt(conf, sector_nr, i);
+
+			if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
+				broadcast_msg = true;
+				/*
+				 * curr_resync_completed is similar to
+				 * sector_nr, so make the translation too.
+				 */
+				sect_va2 = raid10_find_virt(conf,
+					mddev->curr_resync_completed, i);
+
+				if (conf->cluster_sync_low == 0 ||
+				    conf->cluster_sync_low > sect_va2)
+					conf->cluster_sync_low = sect_va2;
+			}
+		}
+		if (broadcast_msg) {
+			raid10_set_cluster_sync_high(conf);
+			md_cluster_ops->resync_info_update(mddev,
+					conf->cluster_sync_low,
+					conf->cluster_sync_high);
+		}
+	}
+
 	while (biolist) {
 		bio = biolist;
 		biolist = biolist->bi_next;
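As a rough illustration of the resync branch above, the following user-space simulation shows how the [cluster_sync_low, cluster_sync_high] window slides forward and when a new range would be broadcast; send_resync_info() is only a stand-in for md_cluster_ops->resync_info_update() and all values are invented.

/* Sketch only: simulates the sliding resync window, not the real driver. */
#include <stdio.h>

#define WINDOW_SECTORS ((32 * 1024 * 1024) >> 9)	/* 32MB window */

static void send_resync_info(unsigned long long lo, unsigned long long hi)
{
	printf("broadcast suspend range [%llu, %llu)\n", lo, hi);
}

int main(void)
{
	unsigned long long max_sector = 4ULL * WINDOW_SECTORS;
	unsigned long long nr_sectors = 1024;		/* per request */
	unsigned long long sync_low = 0, sync_high = 0;
	unsigned long long sector_nr, completed = 0;

	for (sector_nr = 0; sector_nr < max_sector; sector_nr += nr_sectors) {
		if (sync_high < sector_nr + nr_sectors) {
			sync_low = completed;	/* curr_resync_completed */
			sync_high = sync_low + WINDOW_SECTORS;
			send_resync_info(sync_low, sync_high);
		}
		completed = sector_nr;	/* completion lags the cursor */
	}
	return 0;
}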
@@ -3659,6 +3758,18 @@ static int raid10_run(struct mddev *mddev)
 	if (!conf)
 		goto out;
 
+	if (mddev_is_clustered(conf->mddev)) {
+		int fc, fo;
+
+		fc = (mddev->layout >> 8) & 255;
+		fo = mddev->layout & (1<<16);
+		if (fc > 1 || fo > 0) {
+			pr_err("only near layout is supported by clustered"
+			       " raid10\n");
+			goto out;
+		}
+	}
+
 	mddev->thread = conf->thread;
 	conf->thread = NULL;
 
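The new check in raid10_run() rejects far and offset layouts for clustered arrays by decoding mddev->layout. Below is a small sketch of that decoding, assuming the standard raid10 layout word (near copies in bits 0-7, far copies in bits 8-15, the "offset" flag in bit 16); the sample values are illustrative only.

/* Sketch only: applies the same fc/fo test as the patch to sample layouts. */
#include <stdio.h>

static int clustered_layout_ok(int layout)
{
	int fc = (layout >> 8) & 255;	/* far copies  */
	int fo = layout & (1 << 16);	/* offset flag */

	return !(fc > 1 || fo > 0);
}

int main(void)
{
	printf("0x102   -> %s\n", clustered_layout_ok(0x102) ? "ok" : "rejected");	/* 2 near copies */
	printf("0x202   -> %s\n", clustered_layout_ok(0x202) ? "ok" : "rejected");	/* far layout */
	printf("0x10102 -> %s\n", clustered_layout_ok(0x10102) ? "ok" : "rejected");	/* offset layout */
	return 0;
}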