
Merge tag 'md/4.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md

Pull MD updates from Shaohua Li:
 "Several patches from Guoqing fixing md-cluster bugs and several
  patches from Heinz fixing dm-raid bugs"

* tag 'md/4.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
  md-cluster: check the return value of process_recvd_msg
  md-cluster: gather resync infos and enable recv_thread after bitmap is ready
  md: set MD_CHANGE_PENDING in a atomic region
  md: raid5: add prerequisite to run underneath dm-raid
  md: raid10: add prerequisite to run underneath dm-raid
  md: md.c: fix oops in mddev_suspend for raid0
  md-cluster: fix ifnullfree.cocci warnings
  md-cluster/bitmap: unplug bitmap to sync dirty pages to disk
  md-cluster/bitmap: fix wrong page num in bitmap_file_clear_bit and bitmap_file_set_bit
  md-cluster/bitmap: fix wrong calcuation of offset
  md-cluster: sync bitmap when node received RESYNCING msg
  md-cluster: always setup in-memory bitmap
  md-cluster: wakeup thread if activated a spare disk
  md-cluster: change array_sectors and update size are not supported
  md-cluster: fix locking when node joins cluster during message broadcast
  md-cluster: unregister thread if err happened
  md-cluster: wake up thread to continue recovery
  md-cluser: make resync_finish only called after pers->sync_request
  md-cluster: change resync lock from asynchronous to synchronous
Linus Torvalds 9 years ago
Parent commit: feaa7cb5c5

+ 6 - 0
Documentation/md-cluster.txt

@@ -316,3 +316,9 @@ The algorithm is:
  nodes are using the raid which is achieved by lock all bitmap
  locks within the cluster, and also those locks are unlocked
  accordingly.
+
+7. Unsupported features
+
+There are somethings which are not supported by cluster MD yet.
+
+- update size and change array_sectors.

+ 77 - 11
drivers/md/bitmap.c

@@ -46,7 +46,7 @@ static inline char *bmname(struct bitmap *bitmap)
  * allocated while we're using it
  */
 static int bitmap_checkpage(struct bitmap_counts *bitmap,
-			    unsigned long page, int create)
+			    unsigned long page, int create, int no_hijack)
 __releases(bitmap->lock)
 __acquires(bitmap->lock)
 {
@@ -90,6 +90,9 @@ __acquires(bitmap->lock)
 
 	if (mappage == NULL) {
 		pr_debug("md/bitmap: map page allocation failed, hijacking\n");
+		/* We don't support hijack for cluster raid */
+		if (no_hijack)
+			return -ENOMEM;
 		/* failed - set the hijacked flag so that we can use the
 		 * pointer as a counter */
 		if (!bitmap->bp[page].map)
@@ -756,7 +759,7 @@ static int bitmap_storage_alloc(struct bitmap_storage *store,
 		bytes += sizeof(bitmap_super_t);
 
 	num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
-	offset = slot_number * (num_pages - 1);
+	offset = slot_number * num_pages;
 
 	store->filemap = kmalloc(sizeof(struct page *)
 				 * num_pages, GFP_KERNEL);
@@ -900,6 +903,11 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 	struct page *page;
 	void *kaddr;
 	unsigned long chunk = block >> bitmap->counts.chunkshift;
+	struct bitmap_storage *store = &bitmap->storage;
+	unsigned long node_offset = 0;
+
+	if (mddev_is_clustered(bitmap->mddev))
+		node_offset = bitmap->cluster_slot * store->file_pages;
 
 	page = filemap_get_page(&bitmap->storage, chunk);
 	if (!page)
@@ -915,7 +923,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 	kunmap_atomic(kaddr);
 	pr_debug("set file bit %lu page %lu\n", bit, page->index);
 	/* record page number so it gets flushed to disk when unplug occurs */
-	set_page_attr(bitmap, page->index, BITMAP_PAGE_DIRTY);
+	set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY);
 }
 
 static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
@@ -924,6 +932,11 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
 	struct page *page;
 	void *paddr;
 	unsigned long chunk = block >> bitmap->counts.chunkshift;
+	struct bitmap_storage *store = &bitmap->storage;
+	unsigned long node_offset = 0;
+
+	if (mddev_is_clustered(bitmap->mddev))
+		node_offset = bitmap->cluster_slot * store->file_pages;
 
 	page = filemap_get_page(&bitmap->storage, chunk);
 	if (!page)
@@ -935,8 +948,8 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
 	else
 		clear_bit_le(bit, paddr);
 	kunmap_atomic(paddr);
-	if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) {
-		set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING);
+	if (!test_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_NEEDWRITE)) {
+		set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_PENDING);
 		bitmap->allclean = 0;
 	}
 }
@@ -1321,7 +1334,7 @@ __acquires(bitmap->lock)
 	sector_t csize;
 	int err;
 
-	err = bitmap_checkpage(bitmap, page, create);
+	err = bitmap_checkpage(bitmap, page, create, 0);
 
 	if (bitmap->bp[page].hijacked ||
 	    bitmap->bp[page].map == NULL)
@@ -1594,6 +1607,27 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
 }
 EXPORT_SYMBOL(bitmap_cond_end_sync);
 
+void bitmap_sync_with_cluster(struct mddev *mddev,
+			      sector_t old_lo, sector_t old_hi,
+			      sector_t new_lo, sector_t new_hi)
+{
+	struct bitmap *bitmap = mddev->bitmap;
+	sector_t sector, blocks = 0;
+
+	for (sector = old_lo; sector < new_lo; ) {
+		bitmap_end_sync(bitmap, sector, &blocks, 0);
+		sector += blocks;
+	}
+	WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n");
+
+	for (sector = old_hi; sector < new_hi; ) {
+		bitmap_start_sync(bitmap, sector, &blocks, 0);
+		sector += blocks;
+	}
+	WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n");
+}
+EXPORT_SYMBOL(bitmap_sync_with_cluster);
+
 static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
 {
 	/* For each chunk covered by any of these sectors, set the
@@ -1814,6 +1848,9 @@ int bitmap_load(struct mddev *mddev)
 	if (!bitmap)
 		goto out;
 
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
+
 	/* Clear out old bitmap info first:  Either there is none, or we
 	 * are resuming after someone else has possibly changed things,
 	 * so we should forget old cached info.
@@ -1890,14 +1927,14 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot,
 
 	if (clear_bits) {
 		bitmap_update_sb(bitmap);
-		/* Setting this for the ev_page should be enough.
-		 * And we do not require both write_all and PAGE_DIRT either
-		 */
+		/* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs
+		 * BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */
 		for (i = 0; i < bitmap->storage.file_pages; i++)
-			set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
-		bitmap_write_all(bitmap);
+			if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING))
+				set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE);
 		bitmap_unplug(bitmap);
 	}
+	bitmap_unplug(mddev->bitmap);
 	*low = lo;
 	*high = hi;
 err:
@@ -2032,6 +2069,35 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 		     chunks << chunkshift);
 
 	spin_lock_irq(&bitmap->counts.lock);
+	/* For cluster raid, need to pre-allocate bitmap */
+	if (mddev_is_clustered(bitmap->mddev)) {
+		unsigned long page;
+		for (page = 0; page < pages; page++) {
+			ret = bitmap_checkpage(&bitmap->counts, page, 1, 1);
+			if (ret) {
+				unsigned long k;
+
+				/* deallocate the page memory */
+				for (k = 0; k < page; k++) {
+					kfree(new_bp[k].map);
+				}
+
+				/* restore some fields from old_counts */
+				bitmap->counts.bp = old_counts.bp;
+				bitmap->counts.pages = old_counts.pages;
+				bitmap->counts.missing_pages = old_counts.pages;
+				bitmap->counts.chunkshift = old_counts.chunkshift;
+				bitmap->counts.chunks = old_counts.chunks;
+				bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift +
+									     BITMAP_BLOCK_SHIFT);
+				blocks = old_counts.chunks << old_counts.chunkshift;
+				pr_err("Could not pre-allocate in-memory bitmap for cluster raid\n");
+				break;
+			} else
+				bitmap->counts.bp[page].count += 1;
+		}
+	}
+
 	for (block = 0; block < blocks; ) {
 		bitmap_counter_t *bmc_old, *bmc_new;
 		int set;
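
A quick worked example of the offset fix in bitmap_storage_alloc above (values are illustrative, not from the patch): each node's on-disk bitmap occupies num_pages pages, so slot N has to start at page N * num_pages. With an assumed num_pages = 3:

	/* old: offset = slot * (num_pages - 1)  ->  slots start at pages 0, 2, 4;
	 *      slot 0 spans pages 0..2, so slot 1 starting at page 2 overlaps it */
	/* new: offset = slot * num_pages        ->  slots start at pages 0, 3, 6;
	 *      every slot gets its own disjoint range of num_pages pages */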

+ 3 - 0
drivers/md/bitmap.h

@@ -258,6 +258,9 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks,
 void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted);
 void bitmap_close_sync(struct bitmap *bitmap);
 void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force);
+void bitmap_sync_with_cluster(struct mddev *mddev,
+			      sector_t old_lo, sector_t old_hi,
+			      sector_t new_lo, sector_t new_hi);
 
 void bitmap_unplug(struct bitmap *bitmap);
 void bitmap_daemon_work(struct mddev *mddev);

+ 79 - 17
drivers/md/md-cluster.c

@@ -61,6 +61,10 @@ struct resync_info {
  * the lock.
  */
 #define		MD_CLUSTER_SEND_LOCKED_ALREADY		5
+/* We should receive message after node joined cluster and
+ * set up all the related infos such as bitmap and personality */
+#define		MD_CLUSTER_ALREADY_IN_CLUSTER		6
+#define		MD_CLUSTER_PENDING_RECV_EVENT		7
 
 
 struct md_cluster_info {
@@ -85,6 +89,9 @@ struct md_cluster_info {
 	struct completion newdisk_completion;
 	wait_queue_head_t wait;
 	unsigned long state;
+	/* record the region in RESYNCING message */
+	sector_t sync_low;
+	sector_t sync_hi;
 };
 
 enum msg_type {
@@ -284,11 +291,14 @@ static void recover_bitmaps(struct md_thread *thread)
 			goto dlm_unlock;
 		}
 		if (hi > 0) {
-			/* TODO:Wait for current resync to get over */
-			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 			if (lo < mddev->recovery_cp)
 				mddev->recovery_cp = lo;
-			md_check_recovery(mddev);
+			/* wake up thread to continue resync in case resync
+			 * is not finished */
+			if (mddev->recovery_cp != MaxSector) {
+			    set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			    md_wakeup_thread(mddev->thread);
+			}
 		}
 dlm_unlock:
 		dlm_unlock_sync(bm_lockres);
@@ -370,8 +380,12 @@ static void ack_bast(void *arg, int mode)
 	struct dlm_lock_resource *res = arg;
 	struct md_cluster_info *cinfo = res->mddev->cluster_info;
 
-	if (mode == DLM_LOCK_EX)
-		md_wakeup_thread(cinfo->recv_thread);
+	if (mode == DLM_LOCK_EX) {
+		if (test_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state))
+			md_wakeup_thread(cinfo->recv_thread);
+		else
+			set_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state);
+	}
 }
 
 static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
@@ -408,6 +422,30 @@ static void process_suspend_info(struct mddev *mddev,
 		md_wakeup_thread(mddev->thread);
 		return;
 	}
+
+	/*
+	 * The bitmaps are not same for different nodes
+	 * if RESYNCING is happening in one node, then
+	 * the node which received the RESYNCING message
+	 * probably will perform resync with the region
+	 * [lo, hi] again, so we could reduce resync time
+	 * a lot if we can ensure that the bitmaps among
+	 * different nodes are match up well.
+	 *
+	 * sync_low/hi is used to record the region which
+	 * arrived in the previous RESYNCING message,
+	 *
+	 * Call bitmap_sync_with_cluster to clear
+	 * NEEDED_MASK and set RESYNC_MASK since
+	 * resync thread is running in another node,
+	 * so we don't need to do the resync again
+	 * with the same section */
+	bitmap_sync_with_cluster(mddev, cinfo->sync_low,
+					cinfo->sync_hi,
+					lo, hi);
+	cinfo->sync_low = lo;
+	cinfo->sync_hi = hi;
+
 	s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
 	if (!s)
 		return;
@@ -482,11 +520,13 @@ static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
 			__func__, __LINE__, le32_to_cpu(msg->raid_slot));
 }
 
-static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
+static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 {
+	int ret = 0;
+
 	if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot),
 		"node %d received it's own msg\n", le32_to_cpu(msg->slot)))
-		return;
+		return -1;
 	switch (le32_to_cpu(msg->type)) {
 	case METADATA_UPDATED:
 		process_metadata_update(mddev, msg);
@@ -509,9 +549,11 @@ static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 		__recover_slot(mddev, le32_to_cpu(msg->slot));
 		break;
 	default:
+		ret = -1;
 		pr_warn("%s:%d Received unknown message from %d\n",
 			__func__, __LINE__, msg->slot);
 	}
+	return ret;
 }
 
 /*
@@ -535,7 +577,9 @@ static void recv_daemon(struct md_thread *thread)
 
 	/* read lvb and wake up thread to process this message_lockres */
 	memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg));
-	process_recvd_msg(thread->mddev, &msg);
+	ret = process_recvd_msg(thread->mddev, &msg);
+	if (ret)
+		goto out;
 
 	/*release CR on ack_lockres*/
 	ret = dlm_unlock_sync(ack_lockres);
@@ -549,6 +593,7 @@ static void recv_daemon(struct md_thread *thread)
 	ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
 	if (unlikely(ret != 0))
 		pr_info("lock CR on ack failed return %d\n", ret);
+out:
 	/*release CR on message_lockres*/
 	ret = dlm_unlock_sync(message_lockres);
 	if (unlikely(ret != 0))
@@ -778,17 +823,24 @@ static int join(struct mddev *mddev, int nodes)
 	cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0);
 	if (!cinfo->token_lockres)
 		goto err;
-	cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0);
-	if (!cinfo->ack_lockres)
-		goto err;
 	cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0);
 	if (!cinfo->no_new_dev_lockres)
 		goto err;
 
+	ret = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
+	if (ret) {
+		ret = -EAGAIN;
+		pr_err("md-cluster: can't join cluster to avoid lock issue\n");
+		goto err;
+	}
+	cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0);
+	if (!cinfo->ack_lockres)
+		goto err;
 	/* get sync CR lock on ACK. */
 	if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR))
 		pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n",
 				ret);
+	dlm_unlock_sync(cinfo->token_lockres);
 	/* get sync CR lock on no-new-dev. */
 	if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR))
 		pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret);
@@ -809,12 +861,10 @@ static int join(struct mddev *mddev, int nodes)
 	if (!cinfo->resync_lockres)
 		goto err;
 
-	ret = gather_all_resync_info(mddev, nodes);
-	if (ret)
-		goto err;
-
 	return 0;
 err:
+	md_unregister_thread(&cinfo->recovery_thread);
+	md_unregister_thread(&cinfo->recv_thread);
 	lockres_free(cinfo->message_lockres);
 	lockres_free(cinfo->token_lockres);
 	lockres_free(cinfo->ack_lockres);
@@ -828,6 +878,19 @@ err:
 	return ret;
 }
 
+static void load_bitmaps(struct mddev *mddev, int total_slots)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	/* load all the node's bitmap info for resync */
+	if (gather_all_resync_info(mddev, total_slots))
+		pr_err("md-cluster: failed to gather all resyn infos\n");
+	set_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state);
+	/* wake up recv thread in case something need to be handled */
+	if (test_and_clear_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state))
+		md_wakeup_thread(cinfo->recv_thread);
+}
+
 static void resync_bitmap(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
@@ -937,7 +1000,6 @@ static void metadata_update_cancel(struct mddev *mddev)
 static int resync_start(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
-	cinfo->resync_lockres->flags |= DLM_LKF_NOQUEUE;
 	return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX);
 }
 
@@ -967,7 +1029,6 @@ static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
 static int resync_finish(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
-	cinfo->resync_lockres->flags &= ~DLM_LKF_NOQUEUE;
 	dlm_unlock_sync(cinfo->resync_lockres);
 	return resync_info_update(mddev, 0, 0);
 }
@@ -1171,6 +1232,7 @@ static struct md_cluster_operations cluster_ops = {
 	.add_new_disk_cancel = add_new_disk_cancel,
 	.new_disk_ack = new_disk_ack,
 	.remove_disk = remove_disk,
+	.load_bitmaps = load_bitmaps,
 	.gather_bitmaps = gather_bitmaps,
 	.lock_all_bitmaps = lock_all_bitmaps,
 	.unlock_all_bitmaps = unlock_all_bitmaps,

+ 1 - 0
drivers/md/md-cluster.h

@@ -23,6 +23,7 @@ struct md_cluster_operations {
 	void (*add_new_disk_cancel)(struct mddev *mddev);
 	int (*new_disk_ack)(struct mddev *mddev, bool ack);
 	int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
+	void (*load_bitmaps)(struct mddev *mddev, int total_slots);
 	int (*gather_bitmaps)(struct md_rdev *rdev);
 	int (*lock_all_bitmaps)(struct mddev *mddev);
 	void (*unlock_all_bitmaps)(struct mddev *mddev);

+ 53 - 33
drivers/md/md.c

@@ -307,7 +307,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
  */
 void mddev_suspend(struct mddev *mddev)
 {
-	WARN_ON_ONCE(current == mddev->thread->tsk);
+	WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
 	if (mddev->suspended++)
 		return;
 	synchronize_rcu();
@@ -2291,19 +2291,24 @@ void md_update_sb(struct mddev *mddev, int force_change)
 		return;
 	}
 
+repeat:
 	if (mddev_is_clustered(mddev)) {
 		if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
 			force_change = 1;
+		if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
+			nospares = 1;
 		ret = md_cluster_ops->metadata_update_start(mddev);
 		/* Has someone else has updated the sb */
 		if (!does_sb_need_changing(mddev)) {
 			if (ret == 0)
 				md_cluster_ops->metadata_update_cancel(mddev);
-			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+			bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING),
+							 BIT(MD_CHANGE_DEVS) |
+							 BIT(MD_CHANGE_CLEAN));
 			return;
 		}
 	}
-repeat:
+
 	/* First make sure individual recovery_offsets are correct */
 	rdev_for_each(rdev, mddev) {
 		if (rdev->raid_disk >= 0 &&
@@ -2430,15 +2435,14 @@ repeat:
 	md_super_wait(mddev);
 	/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
 
-	spin_lock(&mddev->lock);
+	if (mddev_is_clustered(mddev) && ret == 0)
+		md_cluster_ops->metadata_update_finish(mddev);
+
 	if (mddev->in_sync != sync_req ||
-	    test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
+	    !bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING),
+			       BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_CLEAN)))
 		/* have to write it out again */
-		spin_unlock(&mddev->lock);
 		goto repeat;
-	}
-	clear_bit(MD_CHANGE_PENDING, &mddev->flags);
-	spin_unlock(&mddev->lock);
 	wake_up(&mddev->sb_wait);
 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
@@ -2452,9 +2456,6 @@ repeat:
 		clear_bit(BlockedBadBlocks, &rdev->flags);
 		wake_up(&rdev->blocked_wait);
 	}
-
-	if (mddev_is_clustered(mddev) && ret == 0)
-		md_cluster_ops->metadata_update_finish(mddev);
 }
 EXPORT_SYMBOL(md_update_sb);
 
@@ -4816,6 +4817,10 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len)
 	if (err)
 		return err;
 
+	/* cluster raid doesn't support change array_sectors */
+	if (mddev_is_clustered(mddev))
+		return -EINVAL;
+
 	if (strncmp(buf, "default", 7) == 0) {
 		if (mddev->pers)
 			sectors = mddev->pers->size(mddev, 0, 0);
@@ -6437,6 +6442,10 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
 	int rv;
 	int fit = (num_sectors == 0);
 
+	/* cluster raid doesn't support update size */
+	if (mddev_is_clustered(mddev))
+		return -EINVAL;
+
 	if (mddev->pers->resize == NULL)
 		return -EINVAL;
 	/* The "num_sectors" is the number of sectors of each device that
@@ -7785,7 +7794,7 @@ void md_do_sync(struct md_thread *thread)
 	struct md_rdev *rdev;
 	char *desc, *action = NULL;
 	struct blk_plug plug;
-	bool cluster_resync_finished = false;
+	int ret;
 
 	/* just incase thread restarts... */
 	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
@@ -7795,6 +7804,19 @@ void md_do_sync(struct md_thread *thread)
 		return;
 	}
 
+	if (mddev_is_clustered(mddev)) {
+		ret = md_cluster_ops->resync_start(mddev);
+		if (ret)
+			goto skip;
+
+		if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
+			test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
+			test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
+		     && ((unsigned long long)mddev->curr_resync_completed
+			 < (unsigned long long)mddev->resync_max_sectors))
+			goto skip;
+	}
+
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
 			desc = "data-check";
@@ -8089,11 +8111,6 @@ void md_do_sync(struct md_thread *thread)
 		mddev->curr_resync_completed = mddev->curr_resync;
 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 	}
-	/* tell personality and other nodes that we are finished */
-	if (mddev_is_clustered(mddev)) {
-		md_cluster_ops->resync_finish(mddev);
-		cluster_resync_finished = true;
-	}
 	mddev->pers->sync_request(mddev, max_sectors, &skipped);
 
 	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
@@ -8130,12 +8147,18 @@ void md_do_sync(struct md_thread *thread)
 		}
 	}
  skip:
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-
 	if (mddev_is_clustered(mddev) &&
-	    test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
-	    !cluster_resync_finished)
+	    ret == 0) {
+		/* set CHANGE_PENDING here since maybe another
+		 * update is needed, so other nodes are informed */
+		set_mask_bits(&mddev->flags, 0,
+			      BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS));
+		md_wakeup_thread(mddev->thread);
+		wait_event(mddev->sb_wait,
+			   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
 		md_cluster_ops->resync_finish(mddev);
+	} else
+		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 
 	spin_lock(&mddev->lock);
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -8226,18 +8249,9 @@ static void md_start_sync(struct work_struct *ws)
 	struct mddev *mddev = container_of(ws, struct mddev, del_work);
 	int ret = 0;
 
-	if (mddev_is_clustered(mddev)) {
-		ret = md_cluster_ops->resync_start(mddev);
-		if (ret) {
-			mddev->sync_thread = NULL;
-			goto out;
-		}
-	}
-
 	mddev->sync_thread = md_register_thread(md_do_sync,
 						mddev,
 						"resync");
-out:
 	if (!mddev->sync_thread) {
 		if (!(mddev_is_clustered(mddev) && ret == -EAGAIN))
 			printk(KERN_ERR "%s: could not start resync"
@@ -8536,6 +8550,7 @@ EXPORT_SYMBOL(md_finish_reshape);
 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 		       int is_new)
 {
+	struct mddev *mddev = rdev->mddev;
 	int rv;
 	if (is_new)
 		s += rdev->new_data_offset;
@@ -8545,8 +8560,8 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 	if (rv == 0) {
 		/* Make sure they get written out promptly */
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
-		set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
-		set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
+		set_mask_bits(&mddev->flags, 0,
+			      BIT(MD_CHANGE_CLEAN) | BIT(MD_CHANGE_PENDING));
 		md_wakeup_thread(rdev->mddev->thread);
 		return 1;
 	} else
@@ -8680,6 +8695,11 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
 				ret = remove_and_add_spares(mddev, rdev2);
 				pr_info("Activated spare: %s\n",
 						bdevname(rdev2->bdev,b));
+				/* wakeup mddev->thread here, so array could
+				 * perform resync with the new activated disk */
+				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+				md_wakeup_thread(mddev->thread);
+
 			}
 			/* device faulty
 			 * We just want to do the minimum to mark the disk

+ 2 - 2
drivers/md/raid1.c

@@ -1474,8 +1474,8 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
 	 * if recovery is running, make sure it aborts.
 	 */
 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-	set_bit(MD_CHANGE_PENDING, &mddev->flags);
+	set_mask_bits(&mddev->flags, 0,
+		      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
 	printk(KERN_ALERT
 	       "md/raid1:%s: Disk failure on %s, disabling device.\n"
 	       "md/raid1:%s: Operation continuing on %d devices.\n",

+ 12 - 8
drivers/md/raid10.c

@@ -1102,8 +1102,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 		bio->bi_iter.bi_sector < conf->reshape_progress))) {
 		/* Need to update reshape_position in metadata */
 		mddev->reshape_position = conf->reshape_progress;
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
-		set_bit(MD_CHANGE_PENDING, &mddev->flags);
+		set_mask_bits(&mddev->flags, 0,
+			      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
 		md_wakeup_thread(mddev->thread);
 		wait_event(mddev->sb_wait,
 			   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
@@ -1591,8 +1591,8 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-	set_bit(MD_CHANGE_PENDING, &mddev->flags);
+	set_mask_bits(&mddev->flags, 0,
+		      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 	printk(KERN_ALERT
 	       "md/raid10:%s: Disk failure on %s, disabling device.\n"
@@ -3782,8 +3782,10 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
 			return ret;
 	}
 	md_set_array_sectors(mddev, size);
-	set_capacity(mddev->gendisk, mddev->array_sectors);
-	revalidate_disk(mddev->gendisk);
+	if (mddev->queue) {
+		set_capacity(mddev->gendisk, mddev->array_sectors);
+		revalidate_disk(mddev->gendisk);
+	}
 	if (sectors > mddev->dev_sectors &&
 	    mddev->recovery_cp > oldsize) {
 		mddev->recovery_cp = oldsize;
@@ -4593,8 +4595,10 @@ static void raid10_finish_reshape(struct mddev *mddev)
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		}
 		mddev->resync_max_sectors = size;
-		set_capacity(mddev->gendisk, mddev->array_sectors);
-		revalidate_disk(mddev->gendisk);
+		if (mddev->queue) {
+			set_capacity(mddev->gendisk, mddev->array_sectors);
+			revalidate_disk(mddev->gendisk);
+		}
 	} else {
 		int d;
 		for (d = conf->geo.raid_disks ;

+ 2 - 2
drivers/md/raid5-cache.c

@@ -712,8 +712,8 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
 	 * in_teardown check workaround this issue.
 	 */
 	if (!log->in_teardown) {
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
-		set_bit(MD_CHANGE_PENDING, &mddev->flags);
+		set_mask_bits(&mddev->flags, 0,
+			      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
 		md_wakeup_thread(mddev->thread);
 		wait_event(mddev->sb_wait,
 			!test_bit(MD_CHANGE_PENDING, &mddev->flags) ||

+ 6 - 4
drivers/md/raid5.c

@@ -2514,8 +2514,8 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
 
 	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-	set_bit(MD_CHANGE_PENDING, &mddev->flags);
+	set_mask_bits(&mddev->flags, 0,
+		      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
 	printk(KERN_ALERT
 	       "md/raid:%s: Disk failure on %s, disabling device.\n"
 	       "md/raid:%s: Operation continuing on %d devices.\n",
@@ -7572,8 +7572,10 @@ static void raid5_finish_reshape(struct mddev *mddev)
 
 		if (mddev->delta_disks > 0) {
 			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
-			set_capacity(mddev->gendisk, mddev->array_sectors);
-			revalidate_disk(mddev->gendisk);
+			if (mddev->queue) {
+				set_capacity(mddev->gendisk, mddev->array_sectors);
+				revalidate_disk(mddev->gendisk);
+			}
 		} else {
 			int d;
 			spin_lock_irq(&conf->device_lock);

+ 16 - 0
include/linux/bitops.h

@@ -227,6 +227,22 @@ static inline unsigned long __ffs64(u64 word)
 })
 #endif
 
+#ifndef bit_clear_unless
+#define bit_clear_unless(ptr, _clear, _test)	\
+({								\
+	const typeof(*ptr) clear = (_clear), test = (_test);	\
+	typeof(*ptr) old, new;					\
+								\
+	do {							\
+		old = ACCESS_ONCE(*ptr);			\
+		new = old & ~clear;				\
+	} while (!(old & test) &&				\
+		 cmpxchg(ptr, old, new) != old);		\
+								\
+	!(old & test);						\
+})
+#endif
+
 #ifndef find_last_bit
 /**
  * find_last_bit - find the last set bit in a memory region
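
For reference, the new bit_clear_unless(ptr, clear, test) helper atomically clears the bits in 'clear' unless any bit in 'test' is set, and evaluates to true only when the clear was actually applied; md_update_sb above relies on this to drop MD_CHANGE_PENDING only if no new MD_CHANGE_DEVS/MD_CHANGE_CLEAN change has raced in. A minimal sketch with illustrative values:

	unsigned long flags = BIT(0);               /* only bit 0 set */

	bit_clear_unless(&flags, BIT(0), BIT(1));   /* bit 1 clear: bit 0 is
						     * cleared, result is true */

	flags = BIT(0) | BIT(1);
	bit_clear_unless(&flags, BIT(0), BIT(1));   /* bit 1 set: nothing is
						     * cleared, result is false */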