
Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md

Pull MD updates from Shaohua Li:
 "A few fixes of MD for this merge window. Mostly bug fixes:

   - raid5 stripe batch fix from Amy

   - Read error handling for raid1 FailFast device from Gioh

   - raid10 recovery NULL pointer dereference fix from Guoqing

   - Support write hint for raid5 stripe cache from Mariusz

   - Fixes for device hot add/remove from Neil and Yufen

   - Improve flush bio scalability from Xiao"

* 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
  MD: fix lock contention for flush bios
  md/raid5: Assigning NULL to sh->batch_head before testing bit R5_Overlap of a stripe
  md/raid1: add error handling of read error from FailFast device
  md: fix NULL dereference of mddev->pers in remove_and_add_spares()
  raid5: copy write hint from origin bio to stripe
  md: fix two problems with setting the "re-add" device state.
  raid10: check bio in r10buf_pool_free to avoid NULL pointer dereference
  md: fix an error code format and remove unused bio_sector
Linus Torvalds, 7 years ago, commit d60dafdca4
6 changed files with 148 additions and 70 deletions:

  drivers/md/md.c       + 115 - 54
  drivers/md/md.h       + 15  - 7
  drivers/md/raid1.c    + 2   - 2
  drivers/md/raid10.c   + 6   - 4
  drivers/md/raid5.c    + 9   - 3
  drivers/md/raid5.h    + 1   - 0

+ 115 - 54
drivers/md/md.c

@@ -132,6 +132,24 @@ static inline int speed_max(struct mddev *mddev)
 		mddev->sync_speed_max : sysctl_speed_limit_max;
 }
 
+static void *flush_info_alloc(gfp_t gfp_flags, void *data)
+{
+	return kzalloc(sizeof(struct flush_info), gfp_flags);
+}
+static void flush_info_free(void *flush_info, void *data)
+{
+	kfree(flush_info);
+}
+
+static void *flush_bio_alloc(gfp_t gfp_flags, void *data)
+{
+	return kzalloc(sizeof(struct flush_bio), gfp_flags);
+}
+static void flush_bio_free(void *flush_bio, void *data)
+{
+	kfree(flush_bio);
+}
+
 static struct ctl_table_header *raid_table_header;
 
 static struct ctl_table raid_table[] = {
@@ -414,30 +432,53 @@ static int md_congested(void *data, int bits)
 /*
  * Generic flush handling for md
  */
+static void submit_flushes(struct work_struct *ws)
+{
+	struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
+	struct mddev *mddev = fi->mddev;
+	struct bio *bio = fi->bio;
+
+	bio->bi_opf &= ~REQ_PREFLUSH;
+	md_handle_request(mddev, bio);
+
+	mempool_free(fi, mddev->flush_pool);
+}
 
-static void md_end_flush(struct bio *bio)
+static void md_end_flush(struct bio *fbio)
 {
-	struct md_rdev *rdev = bio->bi_private;
-	struct mddev *mddev = rdev->mddev;
+	struct flush_bio *fb = fbio->bi_private;
+	struct md_rdev *rdev = fb->rdev;
+	struct flush_info *fi = fb->fi;
+	struct bio *bio = fi->bio;
+	struct mddev *mddev = fi->mddev;
 
 	rdev_dec_pending(rdev, mddev);
 
-	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		/* The pre-request flush has finished */
-		queue_work(md_wq, &mddev->flush_work);
+	if (atomic_dec_and_test(&fi->flush_pending)) {
+		if (bio->bi_iter.bi_size == 0)
+			/* an empty barrier - all done */
+			bio_endio(bio);
+		else {
+			INIT_WORK(&fi->flush_work, submit_flushes);
+			queue_work(md_wq, &fi->flush_work);
+		}
 	}
-	bio_put(bio);
-}
 
-static void md_submit_flush_data(struct work_struct *ws);
+	mempool_free(fb, mddev->flush_bio_pool);
+	bio_put(fbio);
+}
 
-static void submit_flushes(struct work_struct *ws)
+void md_flush_request(struct mddev *mddev, struct bio *bio)
 {
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
 	struct md_rdev *rdev;
+	struct flush_info *fi;
+
+	fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);
+
+	fi->bio = bio;
+	fi->mddev = mddev;
+	atomic_set(&fi->flush_pending, 1);
 
-	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
-	atomic_set(&mddev->flush_pending, 1);
 	rcu_read_lock();
 	rdev_for_each_rcu(rdev, mddev)
 		if (rdev->raid_disk >= 0 &&
@@ -447,59 +488,39 @@ static void submit_flushes(struct work_struct *ws)
 			 * we reclaim rcu_read_lock
 			 */
 			struct bio *bi;
+			struct flush_bio *fb;
 			atomic_inc(&rdev->nr_pending);
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
+
+			fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
+			fb->fi = fi;
+			fb->rdev = rdev;
+
 			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
-			bi->bi_end_io = md_end_flush;
-			bi->bi_private = rdev;
 			bio_set_dev(bi, rdev->bdev);
+			bi->bi_end_io = md_end_flush;
+			bi->bi_private = fb;
 			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-			atomic_inc(&mddev->flush_pending);
+
+			atomic_inc(&fi->flush_pending);
 			submit_bio(bi);
+
 			rcu_read_lock();
 			rdev_dec_pending(rdev, mddev);
 		}
 	rcu_read_unlock();
-	if (atomic_dec_and_test(&mddev->flush_pending))
-		queue_work(md_wq, &mddev->flush_work);
-}
-
-static void md_submit_flush_data(struct work_struct *ws)
-{
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
-	struct bio *bio = mddev->flush_bio;
 
-	/*
-	 * must reset flush_bio before calling into md_handle_request to avoid a
-	 * deadlock, because other bios passed md_handle_request suspend check
-	 * could wait for this and below md_handle_request could wait for those
-	 * bios because of suspend check
-	 */
-	mddev->flush_bio = NULL;
-	wake_up(&mddev->sb_wait);
-
-	if (bio->bi_iter.bi_size == 0)
-		/* an empty barrier - all done */
-		bio_endio(bio);
-	else {
-		bio->bi_opf &= ~REQ_PREFLUSH;
-		md_handle_request(mddev, bio);
+	if (atomic_dec_and_test(&fi->flush_pending)) {
+		if (bio->bi_iter.bi_size == 0)
+			/* an empty barrier - all done */
+			bio_endio(bio);
+		else {
+			INIT_WORK(&fi->flush_work, submit_flushes);
+			queue_work(md_wq, &fi->flush_work);
+		}
 	}
 }
-
-void md_flush_request(struct mddev *mddev, struct bio *bio)
-{
-	spin_lock_irq(&mddev->lock);
-	wait_event_lock_irq(mddev->sb_wait,
-			    !mddev->flush_bio,
-			    mddev->lock);
-	mddev->flush_bio = bio;
-	spin_unlock_irq(&mddev->lock);
-
-	INIT_WORK(&mddev->flush_work, submit_flushes);
-	queue_work(md_wq, &mddev->flush_work);
-}
 EXPORT_SYMBOL(md_flush_request);
 
 static inline struct mddev *mddev_get(struct mddev *mddev)
@@ -546,7 +567,6 @@ void mddev_init(struct mddev *mddev)
 	atomic_set(&mddev->openers, 0);
 	atomic_set(&mddev->active_io, 0);
 	spin_lock_init(&mddev->lock);
-	atomic_set(&mddev->flush_pending, 0);
 	init_waitqueue_head(&mddev->sb_wait);
 	init_waitqueue_head(&mddev->recovery_wait);
 	mddev->reshape_position = MaxSector;
@@ -2844,7 +2864,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 			err = 0;
 		}
 	} else if (cmd_match(buf, "re-add")) {
-		if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) {
+		if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
+			rdev->saved_raid_disk >= 0) {
 			/* clear_bit is performed _after_ all the devices
 			 * have their local Faulty bit cleared. If any writes
 			 * happen in the meantime in the local node, they
@@ -5499,6 +5520,22 @@ int md_run(struct mddev *mddev)
 		if (err)
 			return err;
 	}
+	if (mddev->flush_pool == NULL) {
+		mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc,
+						flush_info_free, mddev);
+		if (!mddev->flush_pool) {
+			err = -ENOMEM;
+			goto abort;
+		}
+	}
+	if (mddev->flush_bio_pool == NULL) {
+		mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc,
+						flush_bio_free, mddev);
+		if (!mddev->flush_bio_pool) {
+			err = -ENOMEM;
+			goto abort;
+		}
+	}
 
 	spin_lock(&pers_lock);
 	pers = find_pers(mddev->level, mddev->clevel);
@@ -5654,6 +5691,18 @@ int md_run(struct mddev *mddev)
 	sysfs_notify_dirent_safe(mddev->sysfs_action);
 	sysfs_notify(&mddev->kobj, NULL, "degraded");
 	return 0;
+
+abort:
+	if (mddev->flush_bio_pool) {
+		mempool_destroy(mddev->flush_bio_pool);
+		mddev->flush_bio_pool = NULL;
+	}
+	if (mddev->flush_pool) {
+		mempool_destroy(mddev->flush_pool);
+		mddev->flush_pool = NULL;
+	}
+
+	return err;
 }
 EXPORT_SYMBOL_GPL(md_run);
 
@@ -5864,6 +5913,14 @@ void md_stop(struct mddev *mddev)
 	 * This is called from dm-raid
 	 */
 	__md_stop(mddev);
+	if (mddev->flush_bio_pool) {
+		mempool_destroy(mddev->flush_bio_pool);
+		mddev->flush_bio_pool = NULL;
+	}
+	if (mddev->flush_pool) {
+		mempool_destroy(mddev->flush_pool);
+		mddev->flush_pool = NULL;
+	}
 	bioset_exit(&mddev->bio_set);
 	bioset_exit(&mddev->sync_set);
 }
@@ -6494,6 +6551,9 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
 	char b[BDEVNAME_SIZE];
 	struct md_rdev *rdev;
 
+	if (!mddev->pers)
+		return -ENODEV;
+
 	rdev = find_rdev(mddev, dev);
 	if (!rdev)
 		return -ENXIO;
@@ -8611,6 +8671,7 @@ static int remove_and_add_spares(struct mddev *mddev,
 			if (mddev->pers->hot_remove_disk(
 				    mddev, rdev) == 0) {
 				sysfs_unlink_rdev(mddev, rdev);
+				rdev->saved_raid_disk = rdev->raid_disk;
 				rdev->raid_disk = -1;
 				removed++;
 			}
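
A note on the flush rework above: the old path funneled every flush through a single per-array flush_bio slot, so concurrent flushes serialized on mddev->lock in the removed wait_event_lock_irq() call. The new path gives each request its own flush_info with a private counter, using the usual "biased counter" completion pattern: the count starts at 1 for the submitter, each per-device flush bio takes a reference, and whoever performs the final decrement (an end_io callback or the submitter itself) completes the request. A minimal userspace C sketch of that pattern; the names here are illustrative, not kernel API:

#include <stdatomic.h>
#include <stdio.h>

struct flush_ctx {
	atomic_int pending;	/* starts at 1: the submitter's own reference */
};

static void finish_request(struct flush_ctx *ctx)
{
	(void)ctx;
	printf("all sub-flushes done, completing original request\n");
}

/* Called once per finished sub-flush (an end_io callback in the kernel). */
static void sub_flush_done(struct flush_ctx *ctx)
{
	if (atomic_fetch_sub(&ctx->pending, 1) == 1)
		finish_request(ctx);
}

int main(void)
{
	struct flush_ctx ctx;
	int i, ndevs = 3;

	atomic_init(&ctx.pending, 1);
	for (i = 0; i < ndevs; i++) {
		atomic_fetch_add(&ctx.pending, 1);
		/* submit_bio() would go here; the completion may run
		 * concurrently and call sub_flush_done() on its own. */
		sub_flush_done(&ctx);	/* simulate an immediate completion */
	}
	/* Drop the submitter's reference; if every sub-flush has already
	 * finished, this is the final decrement and completes the request. */
	sub_flush_done(&ctx);
	return 0;
}

Because each in-flight flush owns its counter, concurrent flush requests no longer contend for the single per-array slot.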

+ 15 - 7
drivers/md/md.h

@@ -252,6 +252,19 @@ enum mddev_sb_flags {
 	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
 };
 
+#define NR_FLUSH_INFOS 8
+#define NR_FLUSH_BIOS 64
+struct flush_info {
+	struct bio			*bio;
+	struct mddev			*mddev;
+	struct work_struct		flush_work;
+	atomic_t			flush_pending;
+};
+struct flush_bio {
+	struct flush_info *fi;
+	struct md_rdev *rdev;
+};
+
 struct mddev {
 	void				*private;
 	struct md_personality		*pers;
@@ -457,13 +470,8 @@ struct mddev {
 						   * metadata and bitmap writes
 						   */
 
-	/* Generic flush handling.
-	 * The last to finish preflush schedules a worker to submit
-	 * the rest of the request (without the REQ_PREFLUSH flag).
-	 */
-	struct bio *flush_bio;
-	atomic_t flush_pending;
-	struct work_struct flush_work;
+	mempool_t			*flush_pool;
+	mempool_t			*flush_bio_pool;
 	struct work_struct event_work;	/* used by dm to report failure event */
 	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
 	struct md_cluster_info		*cluster_info;
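
Both structures above are backed by mempools (flush_pool holds NR_FLUSH_INFOS = 8 flush_info elements, flush_bio_pool holds NR_FLUSH_BIOS = 64 flush_bio elements) rather than bare kmalloc, so the flush path keeps making forward progress under memory pressure: a mempool pre-reserves min_nr elements and dips into that reserve instead of failing. A condensed kernel-style sketch of the pool lifecycle, mirroring the md.c hunks above (kernel-only code; the example_* wrapper names are illustrative):

#include <linux/errno.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include "md.h"		/* struct flush_info, struct mddev, NR_FLUSH_INFOS */

static void *flush_info_alloc(gfp_t gfp_flags, void *data)
{
	return kzalloc(sizeof(struct flush_info), gfp_flags);
}

static void flush_info_free(void *element, void *data)
{
	kfree(element);
}

static int example_setup(struct mddev *mddev)	/* as in md_run() */
{
	mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc,
					   flush_info_free, mddev);
	return mddev->flush_pool ? 0 : -ENOMEM;
}

static void example_request(struct mddev *mddev)
{
	/* GFP_NOIO: we are on the write-out path, so reclaim must not
	 * recurse into the block layer. With a sleeping gfp mask the
	 * allocation waits for a free element rather than failing. */
	struct flush_info *fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);

	/* ... fill in fi and submit, as md_flush_request() does ... */
	mempool_free(fi, mddev->flush_pool);
}

static void example_teardown(struct mddev *mddev)	/* as in md_stop() */
{
	mempool_destroy(mddev->flush_pool);
	mddev->flush_pool = NULL;
}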

+ 2 - 2
drivers/md/raid1.c

@@ -2449,7 +2449,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 	struct mddev *mddev = conf->mddev;
 	struct bio *bio;
 	struct md_rdev *rdev;
-	sector_t bio_sector;
 
 	clear_bit(R1BIO_ReadError, &r1_bio->state);
 	/* we got a read error. Maybe the drive is bad.  Maybe just
@@ -2462,7 +2461,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 	 */
 
 	bio = r1_bio->bios[r1_bio->read_disk];
-	bio_sector = conf->mirrors[r1_bio->read_disk].rdev->data_offset + r1_bio->sector;
 	bio_put(bio);
 	r1_bio->bios[r1_bio->read_disk] = NULL;
 
@@ -2473,6 +2471,8 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 		fix_read_error(conf, r1_bio->read_disk,
 			       r1_bio->sector, r1_bio->sectors);
 		unfreeze_array(conf);
+	} else if (mddev->ro == 0 && test_bit(FailFast, &rdev->flags)) {
+		md_error(mddev, rdev);
 	} else {
 		r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
 	}
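
For context on the raid1 change: FailFast is an opt-in, per member device flag, typically set through the rdev state file in sysfs ("failfast" sets it, "-failfast" clears it). With the hunk above, a read error from a FailFast member on a writable array now fails the device via md_error() rather than falling through to the retry path, which is what the flag promises. A small userspace sketch of enabling the flag; the array and member device names are assumptions for illustration:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Illustrative paths: array md0 with member device sda1. */
	const char *state = "/sys/block/md0/md/dev-sda1/state";
	int fd = open(state, O_WRONLY);

	if (fd < 0) {
		perror(state);
		return 1;
	}
	/* Writing "failfast" sets the FailFast rdev flag. */
	if (write(fd, "failfast", strlen("failfast")) < 0)
		perror("write");
	close(fd);
	return 0;
}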

+ 6 - 4
drivers/md/raid10.c

@@ -255,9 +255,11 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
 	for (j = conf->copies; j--; ) {
 		struct bio *bio = r10bio->devs[j].bio;
 
-		rp = get_resync_pages(bio);
-		resync_free_pages(rp);
-		bio_put(bio);
+		if (bio) {
+			rp = get_resync_pages(bio);
+			resync_free_pages(rp);
+			bio_put(bio);
+		}
 
 		bio = r10bio->devs[j].repl_bio;
 		if (bio)
@@ -2362,7 +2364,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 {
 	int sect = 0; /* Offset from r10_bio->sector */
 	int sectors = r10_bio->sectors;
-	struct md_rdev*rdev;
+	struct md_rdev *rdev;
 	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
 	int d = r10_bio->devs[r10_bio->read_slot].devnum;
 

+ 9 - 3
drivers/md/raid5.c

@@ -1139,6 +1139,9 @@ again:
 			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 			bi->bi_io_vec[0].bv_offset = 0;
 			bi->bi_iter.bi_size = STRIPE_SIZE;
+			bi->bi_write_hint = sh->dev[i].write_hint;
+			if (!rrdev)
+				sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
 			/*
 			 * If this is discard request, set bi_vcnt 0. We don't
 			 * want to confuse SCSI because SCSI will replace payload
@@ -1190,6 +1193,8 @@ again:
 			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 			rbi->bi_io_vec[0].bv_offset = 0;
 			rbi->bi_iter.bi_size = STRIPE_SIZE;
+			rbi->bi_write_hint = sh->dev[i].write_hint;
+			sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
 			/*
 			 * If this is discard request, set bi_vcnt 0. We don't
 			 * want to confuse SCSI because SCSI will replace payload
@@ -3204,6 +3209,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 		(unsigned long long)sh->sector);
 
 	spin_lock_irq(&sh->stripe_lock);
+	sh->dev[dd_idx].write_hint = bi->bi_write_hint;
 	/* Don't allow new IO added to stripes in batch list */
 	if (sh->batch_head)
 		goto overlap;
@@ -4614,15 +4620,15 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
 
 		sh->check_state = head_sh->check_state;
 		sh->reconstruct_state = head_sh->reconstruct_state;
+		spin_lock_irq(&sh->stripe_lock);
+		sh->batch_head = NULL;
+		spin_unlock_irq(&sh->stripe_lock);
 		for (i = 0; i < sh->disks; i++) {
 			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
 				do_wakeup = 1;
 			sh->dev[i].flags = head_sh->dev[i].flags &
 				(~((1 << R5_WriteError) | (1 << R5_Overlap)));
 		}
-		spin_lock_irq(&sh->stripe_lock);
-		sh->batch_head = NULL;
-		spin_unlock_irq(&sh->stripe_lock);
 		if (handle_flags == 0 ||
 		    sh->state & handle_flags)
 			set_bit(STRIPE_HANDLE, &sh->state);
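
The write-hint hunks above plumb the per-bio write-life hint into the stripe cache: add_stripe_bio() saves bi_write_hint into the new per-device write_hint field (see the raid5.h hunk below), and ops_run_io() copies it onto the bios issued to member disks. Note the asymmetry: the first copy resets the saved hint only when there is no replacement device (!rrdev), so the replacement bio a few lines later can pick up the same hint before it is cleared. (The final hunk is the separate stripe-batch fix, moving the batch_head reset ahead of the R5_Overlap handling.) Applications set these hints with fcntl(); a minimal userspace sketch, where the filename is illustrative and the fallback constants match include/uapi/linux/fcntl.h:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#ifndef F_SET_RW_HINT			/* fallback for older libc headers */
#define F_SET_RW_HINT		1036	/* F_LINUX_SPECIFIC_BASE + 12 */
#endif
#ifndef RWH_WRITE_LIFE_SHORT
#define RWH_WRITE_LIFE_SHORT	2
#endif

int main(void)
{
	uint64_t hint = RWH_WRITE_LIFE_SHORT;	/* data expected short-lived */
	int fd = open("scratch.dat", O_CREAT | O_WRONLY | O_TRUNC, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Subsequent writes through this inode carry the hint; with the
	 * patch above, raid5 forwards it to the member devices instead
	 * of dropping it at the stripe cache. */
	if (fcntl(fd, F_SET_RW_HINT, &hint) < 0)
		perror("fcntl(F_SET_RW_HINT)");
	if (write(fd, "x", 1) < 0)
		perror("write");
	close(fd);
	return 0;
}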

+ 1 - 0
drivers/md/raid5.h

@@ -257,6 +257,7 @@ struct stripe_head {
 		sector_t	sector;			/* sector of this page */
 		unsigned long	flags;
 		u32		log_checksum;
+		unsigned short	write_hint;
 	} dev[1]; /* allocated with extra space depending of RAID geometry */
 };