
Merge tag 'md/4.12-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md

Pull MD fixes from Shaohua Li:

 - Several bug fixes for raid5-cache from Song Liu, mainly handling
   journal disk errors

 - Fix bad block handling when choosing a raid1 disk, from Tomasz Majchrzak

 - Simplify external metadata array sysfs handling from Artur
   Paszkiewicz

 - Optimize raid0 discard handling from me: raid0 now dispatches large
   discard IOs directly to the underlying disks

* tag 'md/4.12-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
  raid1: prefer disk without bad blocks
  md/r5cache: handle sync with data in write back cache
  md/r5cache: gracefully handle journal device errors for writeback mode
  md/raid1/10: avoid unnecessary locking
  md/raid5-cache: in r5l_do_submit_io(), submit io->split_bio first
  md/md0: optimize raid0 discard handling
  md: don't return -EAGAIN in md_allow_write for external metadata arrays
  md/raid5: make use of spin_lock_irq over local_irq_disable + spin_lock
Linus Torvalds, 8 years ago
commit 8b4822de59

8 changed files with 209 additions and 86 deletions:
  1. drivers/md/md.c          (+8, -12)
  2. drivers/md/md.h          (+1, -1)
  3. drivers/md/raid0.c       (+102, -14)
  4. drivers/md/raid1.c       (+10, -11)
  5. drivers/md/raid10.c      (+3, -4)
  6. drivers/md/raid5-cache.c (+35, -12)
  7. drivers/md/raid5-log.h   (+2, -1)
  8. drivers/md/raid5.c       (+48, -31)

+ 8 - 12
drivers/md/md.c

@@ -8022,18 +8022,15 @@ EXPORT_SYMBOL(md_write_end);
  * may proceed without blocking.  It is important to call this before
  * attempting a GFP_KERNEL allocation while holding the mddev lock.
  * Must be called with mddev_lock held.
- *
- * In the ->external case MD_SB_CHANGE_PENDING can not be cleared until mddev->lock
- * is dropped, so return -EAGAIN after notifying userspace.
  */
-int md_allow_write(struct mddev *mddev)
+void md_allow_write(struct mddev *mddev)
 {
 	if (!mddev->pers)
-		return 0;
+		return;
 	if (mddev->ro)
-		return 0;
+		return;
 	if (!mddev->pers->sync_request)
-		return 0;
+		return;
 
 	spin_lock(&mddev->lock);
 	if (mddev->in_sync) {
@@ -8046,13 +8043,12 @@ int md_allow_write(struct mddev *mddev)
 		spin_unlock(&mddev->lock);
 		md_update_sb(mddev, 0);
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
+		/* wait for the dirty state to be recorded in the metadata */
+		wait_event(mddev->sb_wait,
+			   !test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) &&
+			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
 	} else
 		spin_unlock(&mddev->lock);
-
-	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
-		return -EAGAIN;
-	else
-		return 0;
 }
 EXPORT_SYMBOL_GPL(md_allow_write);
 

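For callers, the void conversion removes the old -EAGAIN handling entirely: md_allow_write() now waits on mddev->sb_wait itself. A minimal sketch of the new calling convention, with a hypothetical caller name (assumes "md.h" is included):

/*
 * Hypothetical caller, for illustration only. md_allow_write() blocks
 * until the dirty state has been recorded in the metadata, so there is
 * no error left to propagate before sleeping allocations.
 */
static int example_prepare_resize(struct mddev *mddev)
{
	md_allow_write(mddev);	/* may sleep; mddev_lock must be held */

	/* GFP_KERNEL allocations can now follow without deadlocking on I/O */
	return 0;
}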
+ 1 - 1
drivers/md/md.h

@@ -665,7 +665,7 @@ extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 			bool metadata_op);
 extern void md_do_sync(struct md_thread *thread);
 extern void md_new_event(struct mddev *mddev);
-extern int md_allow_write(struct mddev *mddev);
+extern void md_allow_write(struct mddev *mddev);
 extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
 extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
 extern int md_check_no_bitmap(struct mddev *mddev);

+ 102 - 14
drivers/md/raid0.c

@@ -385,7 +385,7 @@ static int raid0_run(struct mddev *mddev)
 		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
 		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
-		blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
+		blk_queue_max_discard_sectors(mddev->queue, UINT_MAX);
 
 		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
 		blk_queue_io_opt(mddev->queue,
@@ -459,6 +459,95 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
 	}
 }
 
+static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
+{
+	struct r0conf *conf = mddev->private;
+	struct strip_zone *zone;
+	sector_t start = bio->bi_iter.bi_sector;
+	sector_t end;
+	unsigned int stripe_size;
+	sector_t first_stripe_index, last_stripe_index;
+	sector_t start_disk_offset;
+	unsigned int start_disk_index;
+	sector_t end_disk_offset;
+	unsigned int end_disk_index;
+	unsigned int disk;
+
+	zone = find_zone(conf, &start);
+
+	if (bio_end_sector(bio) > zone->zone_end) {
+		struct bio *split = bio_split(bio,
+			zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
+			mddev->bio_set);
+		bio_chain(split, bio);
+		generic_make_request(bio);
+		bio = split;
+		end = zone->zone_end;
+	} else
+		end = bio_end_sector(bio);
+
+	if (zone != conf->strip_zone)
+		end = end - zone[-1].zone_end;
+
+	/* Now start and end are offsets within the zone */
+	stripe_size = zone->nb_dev * mddev->chunk_sectors;
+
+	first_stripe_index = start;
+	sector_div(first_stripe_index, stripe_size);
+	last_stripe_index = end;
+	sector_div(last_stripe_index, stripe_size);
+
+	start_disk_index = (int)(start - first_stripe_index * stripe_size) /
+		mddev->chunk_sectors;
+	start_disk_offset = ((int)(start - first_stripe_index * stripe_size) %
+		mddev->chunk_sectors) +
+		first_stripe_index * mddev->chunk_sectors;
+	end_disk_index = (int)(end - last_stripe_index * stripe_size) /
+		mddev->chunk_sectors;
+	end_disk_offset = ((int)(end - last_stripe_index * stripe_size) %
+		mddev->chunk_sectors) +
+		last_stripe_index * mddev->chunk_sectors;
+
+	for (disk = 0; disk < zone->nb_dev; disk++) {
+		sector_t dev_start, dev_end;
+		struct bio *discard_bio = NULL;
+		struct md_rdev *rdev;
+
+		if (disk < start_disk_index)
+			dev_start = (first_stripe_index + 1) *
+				mddev->chunk_sectors;
+		else if (disk > start_disk_index)
+			dev_start = first_stripe_index * mddev->chunk_sectors;
+		else
+			dev_start = start_disk_offset;
+
+		if (disk < end_disk_index)
+			dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
+		else if (disk > end_disk_index)
+			dev_end = last_stripe_index * mddev->chunk_sectors;
+		else
+			dev_end = end_disk_offset;
+
+		if (dev_end <= dev_start)
+			continue;
+
+		rdev = conf->devlist[(zone - conf->strip_zone) *
+			conf->strip_zone[0].nb_dev + disk];
+		if (__blkdev_issue_discard(rdev->bdev,
+			dev_start + zone->dev_start + rdev->data_offset,
+			dev_end - dev_start, GFP_NOIO, 0, &discard_bio) ||
+		    !discard_bio)
+			continue;
+		bio_chain(discard_bio, bio);
+		if (mddev->gendisk)
+			trace_block_bio_remap(bdev_get_queue(rdev->bdev),
+				discard_bio, disk_devt(mddev->gendisk),
+				bio->bi_iter.bi_sector);
+		generic_make_request(discard_bio);
+	}
+	bio_endio(bio);
+}
+
 static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct strip_zone *zone;
@@ -473,6 +562,11 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 		return;
 	}
 
+	if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
+		raid0_handle_discard(mddev, bio);
+		return;
+	}
+
 	bio_sector = bio->bi_iter.bi_sector;
 	sector = bio_sector;
 	chunk_sects = mddev->chunk_sectors;
@@ -498,19 +592,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 	bio->bi_iter.bi_sector = sector + zone->dev_start +
 		tmp_dev->data_offset;
 
-	if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
-		     !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
-		/* Just ignore it */
-		bio_endio(bio);
-	} else {
-		if (mddev->gendisk)
-			trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
-					      bio, disk_devt(mddev->gendisk),
-					      bio_sector);
-		mddev_check_writesame(mddev, bio);
-		mddev_check_write_zeroes(mddev, bio);
-		generic_make_request(bio);
-	}
+	if (mddev->gendisk)
+		trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+				      bio, disk_devt(mddev->gendisk),
+				      bio_sector);
+	mddev_check_writesame(mddev, bio);
+	mddev_check_write_zeroes(mddev, bio);
+	generic_make_request(bio);
 }
 
 static void raid0_status(struct seq_file *seq, struct mddev *mddev)

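The per-disk arithmetic in raid0_handle_discard() above can be checked with a small userspace sketch of the same computation for a single-zone array; the disk count, chunk size and discard range below are invented example values, not values from the patch:

#include <stdio.h>

int main(void)
{
	const unsigned long long chunk = 128;	/* sectors per chunk (example) */
	const unsigned int nb_dev = 3;		/* member disks (example) */
	const unsigned long long start = 100, end = 1000; /* discard range */
	const unsigned long long stripe = nb_dev * chunk;

	unsigned long long first_stripe = start / stripe;
	unsigned long long last_stripe = end / stripe;
	unsigned int start_disk = (start - first_stripe * stripe) / chunk;
	unsigned int end_disk = (end - last_stripe * stripe) / chunk;
	unsigned long long start_off =
		(start - first_stripe * stripe) % chunk + first_stripe * chunk;
	unsigned long long end_off =
		(end - last_stripe * stripe) % chunk + last_stripe * chunk;

	for (unsigned int disk = 0; disk < nb_dev; disk++) {
		unsigned long long dev_start, dev_end;

		/* same per-disk range selection as the kernel function */
		if (disk < start_disk)
			dev_start = (first_stripe + 1) * chunk;
		else if (disk > start_disk)
			dev_start = first_stripe * chunk;
		else
			dev_start = start_off;

		if (disk < end_disk)
			dev_end = (last_stripe + 1) * chunk;
		else if (disk > end_disk)
			dev_end = last_stripe * chunk;
		else
			dev_end = end_off;

		if (dev_end <= dev_start)
			continue;
		printf("disk %u: discard sectors [%llu, %llu)\n",
		       disk, dev_start, dev_end);
	}
	return 0;
}

For a 3-disk array with 128-sector chunks and a discard of sectors [100, 1000) within the zone, this prints one sub-range per member disk, and the three sub-ranges add up to exactly the 900 requested sectors.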
+ 10 - 11
drivers/md/raid1.c

@@ -666,8 +666,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 					break;
 			}
 			continue;
-		} else
+		} else {
+			if ((sectors > best_good_sectors) && (best_disk >= 0))
+				best_disk = -1;
 			best_good_sectors = sectors;
+		}
 
 		if (best_disk >= 0)
 			/* At least two disks to choose from so failfast is OK */
@@ -1529,17 +1532,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 			plug = container_of(cb, struct raid1_plug_cb, cb);
 		else
 			plug = NULL;
-		spin_lock_irqsave(&conf->device_lock, flags);
 		if (plug) {
 			bio_list_add(&plug->pending, mbio);
 			plug->pending_cnt++;
 		} else {
+			spin_lock_irqsave(&conf->device_lock, flags);
 			bio_list_add(&conf->pending_bio_list, mbio);
 			conf->pending_count++;
-		}
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-		if (!plug)
+			spin_unlock_irqrestore(&conf->device_lock, flags);
 			md_wakeup_thread(mddev->thread);
+		}
 	}
 
 	r1_bio_write_done(r1_bio);
@@ -3197,7 +3199,7 @@ static int raid1_reshape(struct mddev *mddev)
 	struct r1conf *conf = mddev->private;
 	int cnt, raid_disks;
 	unsigned long flags;
-	int d, d2, err;
+	int d, d2;
 
 	/* Cannot change chunk_size, layout, or level */
 	if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
@@ -3209,11 +3211,8 @@ static int raid1_reshape(struct mddev *mddev)
 		return -EINVAL;
 	}
 
-	if (!mddev_is_clustered(mddev)) {
-		err = md_allow_write(mddev);
-		if (err)
-			return err;
-	}
+	if (!mddev_is_clustered(mddev))
+		md_allow_write(mddev);
 
 	raid_disks = mddev->raid_disks + mddev->delta_disks;
 

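The read_balance() hunk above amounts to one rule: when a later mirror can serve more consecutive good sectors than the candidate chosen so far (which may have been limited by bad blocks), the earlier choice is dropped. A deliberately simplified userspace sketch of just that rule, with invented per-disk numbers; it is not a model of read_balance() as a whole:

#include <stdio.h>

int main(void)
{
	/* consecutive good sectors each mirror can serve; invented numbers */
	const int good_sectors[3] = { 8, 128, 128 };
	int best_disk = -1, best_good_sectors = 0;

	for (int disk = 0; disk < 3; disk++) {
		int sectors = good_sectors[disk];

		/*
		 * The rule from the hunk: a disk offering more good sectors
		 * invalidates an earlier pick that was limited by bad blocks.
		 */
		if (sectors > best_good_sectors && best_disk >= 0)
			best_disk = -1;
		if (sectors > best_good_sectors)
			best_good_sectors = sectors;
		if (best_disk < 0)
			best_disk = disk;	/* simplified candidate choice */
	}
	printf("picked disk %d (%d good sectors)\n",
	       best_disk, best_good_sectors);
	return 0;
}

Without the reset, disk 0 (limited to 8 good sectors by a bad block) would stay selected and the rest of the read would have to be retried elsewhere; with it, disk 1 serves the whole request.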
+ 3 - 4
drivers/md/raid10.c

@@ -1282,17 +1282,16 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 		plug = container_of(cb, struct raid10_plug_cb, cb);
 	else
 		plug = NULL;
-	spin_lock_irqsave(&conf->device_lock, flags);
 	if (plug) {
 		bio_list_add(&plug->pending, mbio);
 		plug->pending_cnt++;
 	} else {
+		spin_lock_irqsave(&conf->device_lock, flags);
 		bio_list_add(&conf->pending_bio_list, mbio);
 		conf->pending_count++;
-	}
-	spin_unlock_irqrestore(&conf->device_lock, flags);
-	if (!plug)
+		spin_unlock_irqrestore(&conf->device_lock, flags);
 		md_wakeup_thread(mddev->thread);
+	}
 }
 
 static void raid10_write_request(struct mddev *mddev, struct bio *bio,

+ 35 - 12
drivers/md/raid5-cache.c

@@ -24,6 +24,7 @@
 #include "md.h"
 #include "raid5.h"
 #include "bitmap.h"
+#include "raid5-log.h"
 
 /*
  * metadata/data stored in disk with 4k size unit (a block) regardless
@@ -622,20 +623,30 @@ static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
 	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
 	spin_unlock_irqrestore(&log->io_list_lock, flags);
 
+	/*
+	 * In case of journal device failures, submit_bio will get an
+	 * error and call endio, and active stripes will then continue the
+	 * write process. Therefore, it is not necessary to check the
+	 * Faulty bit of the journal device here.
+	 *
+	 * We can't check split_bio after current_bio is submitted. If
+	 * io->split_bio is null, after current_bio is submitted, current_bio
+	 * might already be completed and the io_unit is freed. We submit
+	 * split_bio first to avoid the issue.
+	 */
+	if (io->split_bio) {
+		if (io->has_flush)
+			io->split_bio->bi_opf |= REQ_PREFLUSH;
+		if (io->has_fua)
+			io->split_bio->bi_opf |= REQ_FUA;
+		submit_bio(io->split_bio);
+	}
+
 	if (io->has_flush)
 		io->current_bio->bi_opf |= REQ_PREFLUSH;
 	if (io->has_fua)
 		io->current_bio->bi_opf |= REQ_FUA;
 	submit_bio(io->current_bio);
-
-	if (!io->split_bio)
-		return;
-
-	if (io->has_flush)
-		io->split_bio->bi_opf |= REQ_PREFLUSH;
-	if (io->has_fua)
-		io->split_bio->bi_opf |= REQ_FUA;
-	submit_bio(io->split_bio);
 }
 
 /* deferred io_unit will be dispatched here */
@@ -670,6 +681,11 @@ static void r5c_disable_writeback_async(struct work_struct *work)
 		return;
 	pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
 		mdname(mddev));
+
+	/* wait for superblock change before suspend */
+	wait_event(mddev->sb_wait,
+		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
+
 	mddev_suspend(mddev);
 	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
 	mddev_resume(mddev);
@@ -2621,8 +2637,11 @@ int r5c_try_caching_write(struct r5conf *conf,
 	 * When run in degraded mode, array is set to write-through mode.
 	 * This check helps drain pending write safely in the transition to
 	 * write-through mode.
+	 *
+	 * When a stripe is syncing, the write is also handled in write
+	 * through mode.
 	 */
-	if (s->failed) {
+	if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) {
 		r5c_make_stripe_write_out(sh);
 		return -EAGAIN;
 	}
@@ -2825,6 +2844,9 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
 	}
 
 	r5l_append_flush_payload(log, sh->sector);
+	/* stripe is flushed to raid disks, we can do resync now */
+	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
+		set_bit(STRIPE_HANDLE, &sh->state);
 }
 
 int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
@@ -2973,7 +2995,7 @@ ioerr:
 	return ret;
 }
 
-void r5c_update_on_rdev_error(struct mddev *mddev)
+void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct r5conf *conf = mddev->private;
 	struct r5l_log *log = conf->log;
@@ -2981,7 +3003,8 @@ void r5c_update_on_rdev_error(struct mddev *mddev)
 	if (!log)
 		return;
 
-	if (raid5_calc_degraded(conf) > 0 &&
+	if ((raid5_calc_degraded(conf) > 0 ||
+	     test_bit(Journal, &rdev->flags)) &&
 	    conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
 		schedule_work(&log->disable_writeback_work);
 }

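The ordering rule documented in r5l_do_submit_io() above, submit io->split_bio before io->current_bio because completing current_bio may free the io_unit, can be mimicked with a plain userspace analogue. The structures and names below are invented for illustration and are not kernel code:

#include <stdio.h>
#include <stdlib.h>

struct item { int id; };

struct unit {
	struct item *main_item;
	struct item *split_item;	/* optional, may be NULL */
};

static struct unit *pending;		/* freed by the main completion */

static void complete_main(void)
{
	free(pending->main_item);
	free(pending->split_item);
	free(pending);
	pending = NULL;
}

static void hand_off(struct item *it)
{
	printf("handed off item %d\n", it->id);
	/* the real completion is asynchronous; here it runs immediately */
	if (pending && it == pending->main_item)
		complete_main();
}

int main(void)
{
	struct unit *u = calloc(1, sizeof(*u));

	u->main_item = malloc(sizeof(*u->main_item));
	u->split_item = malloc(sizeof(*u->split_item));
	u->main_item->id = 1;
	u->split_item->id = 2;
	pending = u;

	/* safe order: consume u->split_item before the hand-off can free u */
	if (u->split_item)
		hand_off(u->split_item);
	hand_off(u->main_item);
	/* reading u->split_item past this point would be a use-after-free */
	return 0;
}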
+ 2 - 1
drivers/md/raid5-log.h

@@ -28,7 +28,8 @@ extern void r5c_flush_cache(struct r5conf *conf, int num);
 extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
 extern void r5c_check_cached_full_stripe(struct r5conf *conf);
 extern struct md_sysfs_entry r5c_journal_mode;
-extern void r5c_update_on_rdev_error(struct mddev *mddev);
+extern void r5c_update_on_rdev_error(struct mddev *mddev,
+				     struct md_rdev *rdev);
 extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
 
 extern struct dma_async_tx_descriptor *

+ 48 - 31
drivers/md/raid5.c

@@ -103,8 +103,7 @@ static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
 static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
 {
 	int i;
-	local_irq_disable();
-	spin_lock(conf->hash_locks);
+	spin_lock_irq(conf->hash_locks);
 	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
 		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
 	spin_lock(&conf->device_lock);
@@ -114,9 +113,9 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
 {
 	int i;
 	spin_unlock(&conf->device_lock);
-	for (i = NR_STRIPE_HASH_LOCKS; i; i--)
-		spin_unlock(conf->hash_locks + i - 1);
-	local_irq_enable();
+	for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
+		spin_unlock(conf->hash_locks + i);
+	spin_unlock_irq(conf->hash_locks);
 }
 
 /* Find first data disk in a raid6 stripe */
@@ -234,11 +233,15 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 			if (test_bit(R5_InJournal, &sh->dev[i].flags))
 				injournal++;
 	/*
-	 * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
-	 * data in journal, so they are not released to cached lists
+	 * In the following cases, the stripe cannot be released to cached
+	 * lists. Therefore, we make the stripe write out and set
+	 * STRIPE_HANDLE:
+	 *   1. when quiescing in r5c write-back mode;
+	 *   2. when resync is requested for the stripe.
 	 */
-	if (conf->quiesce && r5c_is_writeback(conf->log) &&
-	    !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
+	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
+	    (conf->quiesce && r5c_is_writeback(conf->log) &&
+	     !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
 		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
 			r5c_make_stripe_write_out(sh);
 		set_bit(STRIPE_HANDLE, &sh->state);
@@ -714,12 +717,11 @@ static bool is_full_stripe_write(struct stripe_head *sh)
 
 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 {
-	local_irq_disable();
 	if (sh1 > sh2) {
-		spin_lock(&sh2->stripe_lock);
+		spin_lock_irq(&sh2->stripe_lock);
 		spin_lock_nested(&sh1->stripe_lock, 1);
 	} else {
-		spin_lock(&sh1->stripe_lock);
+		spin_lock_irq(&sh1->stripe_lock);
 		spin_lock_nested(&sh2->stripe_lock, 1);
 	}
 }
@@ -727,8 +729,7 @@ static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 {
 	spin_unlock(&sh1->stripe_lock);
-	spin_unlock(&sh2->stripe_lock);
-	local_irq_enable();
+	spin_unlock_irq(&sh2->stripe_lock);
 }
 
 /* Only freshly new full stripe normal write stripe can be added to a batch list */
@@ -2312,14 +2313,12 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	struct stripe_head *osh, *nsh;
 	LIST_HEAD(newstripes);
 	struct disk_info *ndisks;
-	int err;
+	int err = 0;
 	struct kmem_cache *sc;
 	int i;
 	int hash, cnt;
 
-	err = md_allow_write(conf->mddev);
-	if (err)
-		return err;
+	md_allow_write(conf->mddev);
 
 	/* Step 1 */
 	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
@@ -2694,7 +2693,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
 		bdevname(rdev->bdev, b),
 		mdname(mddev),
 		conf->raid_disks - mddev->degraded);
-	r5c_update_on_rdev_error(mddev);
+	r5c_update_on_rdev_error(mddev, rdev);
 }
 
 /*
@@ -3055,6 +3054,11 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
  *      When LOG_CRITICAL, stripes with injournal == 0 will be sent to
  *      no_space_stripes list.
  *
+ *   3. during journal failure
+ *      On journal failure, we try to flush all cached data to the raid
+ *      disks based on what is in the stripe cache. The array is read-only
+ *      to upper layers, so we skip all pending writes.
+ *
  */
 static inline bool delay_towrite(struct r5conf *conf,
 				 struct r5dev *dev,
@@ -3068,6 +3072,9 @@ static inline bool delay_towrite(struct r5conf *conf,
 	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
 	    s->injournal > 0)
 		return true;
+	/* case 3 above */
+	if (s->log_failed && s->injournal)
+		return true;
 	return false;
 }
 
@@ -4653,8 +4660,13 @@ static void handle_stripe(struct stripe_head *sh)
 
 	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
 		spin_lock(&sh->stripe_lock);
-		/* Cannot process 'sync' concurrently with 'discard' */
-		if (!test_bit(STRIPE_DISCARD, &sh->state) &&
+		/*
+		 * Cannot process 'sync' concurrently with 'discard'.
+		 * Flush data in r5cache before 'sync'.
+		 */
+		if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
+		    !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
+		    !test_bit(STRIPE_DISCARD, &sh->state) &&
 		    test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
 			set_bit(STRIPE_SYNCING, &sh->state);
 			clear_bit(STRIPE_INSYNC, &sh->state);
@@ -4701,10 +4713,15 @@ static void handle_stripe(struct stripe_head *sh)
 	       " to_write=%d failed=%d failed_num=%d,%d\n",
 	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
 	       s.failed_num[0], s.failed_num[1]);
-	/* check if the array has lost more than max_degraded devices and,
+	/*
+	 * check if the array has lost more than max_degraded devices and,
 	 * if so, some requests might need to be failed.
+	 *
+	 * When the journal device has failed (log_failed), only process the
+	 * stripe if there is data that needs to be written to the raid disks.
 	 */
-	if (s.failed > conf->max_degraded || s.log_failed) {
+	if (s.failed > conf->max_degraded ||
+	    (s.log_failed && s.injournal == 0)) {
 		sh->check_state = 0;
 		sh->reconstruct_state = 0;
 		break_stripe_batch_list(sh, 0);
@@ -5277,8 +5294,10 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 	struct stripe_head *sh, *tmp;
 	struct list_head *handle_list = NULL;
 	struct r5worker_group *wg;
-	bool second_try = !r5c_is_writeback(conf->log);
-	bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state);
+	bool second_try = !r5c_is_writeback(conf->log) &&
+		!r5l_log_disk_error(conf);
+	bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
+		r5l_log_disk_error(conf);
 
 again:
 	wg = NULL;
@@ -6313,7 +6332,6 @@ int
 raid5_set_cache_size(struct mddev *mddev, int size)
 {
 	struct r5conf *conf = mddev->private;
-	int err;
 
 	if (size <= 16 || size > 32768)
 		return -EINVAL;
@@ -6325,10 +6343,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
 		;
 	mutex_unlock(&conf->cache_size_mutex);
 
-
-	err = md_allow_write(mddev);
-	if (err)
-		return err;
+	md_allow_write(mddev);
 
 	mutex_lock(&conf->cache_size_mutex);
 	while (size > conf->max_nr_stripes)
@@ -7530,7 +7545,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * neilb: there is no locking about new writes here,
 		 * so this cannot be safe.
 		 */
-		if (atomic_read(&conf->active_stripes)) {
+		if (atomic_read(&conf->active_stripes) ||
+		    atomic_read(&conf->r5c_cached_full_stripes) ||
+		    atomic_read(&conf->r5c_cached_partial_stripes)) {
 			return -EBUSY;
 		}
 		log_exit(conf);
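The local_irq_disable()/local_irq_enable() removals in this file all follow the same idiom: fold the interrupt toggling into the outermost lock with spin_lock_irq()/spin_unlock_irq(). A kernel-style sketch of the before/after pattern, with invented lock names (not code from the patch):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(outer_lock);
static DEFINE_SPINLOCK(inner_lock);

/* old idiom: open-coded IRQ disable around plain spin_lock() calls */
static void lock_both_old(void)
{
	local_irq_disable();
	spin_lock(&outer_lock);
	spin_lock_nested(&inner_lock, SINGLE_DEPTH_NESTING);
}

static void unlock_both_old(void)
{
	spin_unlock(&inner_lock);
	spin_unlock(&outer_lock);
	local_irq_enable();
}

/* new idiom: the outermost lock carries the IRQ state transition */
static void lock_both_new(void)
{
	spin_lock_irq(&outer_lock);
	spin_lock_nested(&inner_lock, SINGLE_DEPTH_NESTING);
}

static void unlock_both_new(void)
{
	spin_unlock(&inner_lock);
	spin_unlock_irq(&outer_lock);
}

The behaviour is unchanged; the paired form simply makes the IRQ-off region coincide with the outermost lock, which is what the lock_all_device_hash_locks_irq() and lock_two_stripes() hunks above do.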