@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2015 Shaohua Li <shli@fb.com>
+ * Copyright (C) 2016 Song Liu <songliubraving@fb.com>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -1354,6 +1355,9 @@ struct r5l_recovery_ctx {
         sector_t meta_total_blocks;     /* total size of current meta and data */
         sector_t pos;                   /* recovery position */
         u64 seq;                        /* recovery position seq */
+        int data_parity_stripes;        /* number of data_parity stripes */
+        int data_only_stripes;          /* number of data_only stripes */
+        struct list_head cached_list;
 };

 static int r5l_recovery_read_meta_block(struct r5l_log *log,
@@ -1576,6 +1580,590 @@ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
         return 0;
 }

+/*
+ * r5l_recovery_load_data and r5l_recovery_load_parity use flag R5_Wantwrite
+ * to mark valid (potentially not flushed) data in the journal.
+ *
+ * We already verified the checksum in r5l_recovery_verify_data_checksum_for_mb,
+ * so there should not be any mismatch here.
+ */
+static void r5l_recovery_load_data(struct r5l_log *log,
+                struct stripe_head *sh,
+                struct r5l_recovery_ctx *ctx,
+                struct r5l_payload_data_parity *payload,
+                sector_t log_offset)
+{
+        struct mddev *mddev = log->rdev->mddev;
+        struct r5conf *conf = mddev->private;
+        int dd_idx;
+
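+        /*
+         * Map the array sector in payload->location to the member-disk slot
+         * (dd_idx) of this stripe, then read the logged page from the
+         * journal device into that slot of the stripe cache.
+         */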
+        raid5_compute_sector(conf,
+                        le64_to_cpu(payload->location), 0,
+                        &dd_idx, sh);
+        sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+                        sh->dev[dd_idx].page, REQ_OP_READ, 0, false);
+        sh->dev[dd_idx].log_checksum =
+                        le32_to_cpu(payload->checksum[0]);
+        ctx->meta_total_blocks += BLOCK_SECTORS;
+
+        set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags);
+        set_bit(STRIPE_R5C_CACHING, &sh->state);
+}
+
+static void r5l_recovery_load_parity(struct r5l_log *log,
+                struct stripe_head *sh,
+                struct r5l_recovery_ctx *ctx,
+                struct r5l_payload_data_parity *payload,
+                sector_t log_offset)
+{
+        struct mddev *mddev = log->rdev->mddev;
+        struct r5conf *conf = mddev->private;
+
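+        /* a parity payload carries P, plus Q when max_degraded == 2 (RAID6) */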
+        ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
+        sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+                        sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false);
+        sh->dev[sh->pd_idx].log_checksum =
+                        le32_to_cpu(payload->checksum[0]);
+        set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
+
+        if (sh->qd_idx >= 0) {
+                sync_page_io(log->rdev,
+                                r5l_ring_add(log, log_offset, BLOCK_SECTORS),
+                                PAGE_SIZE, sh->dev[sh->qd_idx].page,
+                                REQ_OP_READ, 0, false);
+                sh->dev[sh->qd_idx].log_checksum =
+                                le32_to_cpu(payload->checksum[1]);
+                set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
+        }
+        clear_bit(STRIPE_R5C_CACHING, &sh->state);
+}
+
+static void r5l_recovery_reset_stripe(struct stripe_head *sh)
+{
+        int i;
+
+        sh->state = 0;
+        sh->log_start = MaxSector;
+        for (i = sh->disks; i--; )
+                sh->dev[i].flags = 0;
+}
+
+static void
+r5l_recovery_replay_one_stripe(struct r5conf *conf,
+                struct stripe_head *sh,
+                struct r5l_recovery_ctx *ctx)
+{
+        struct md_rdev *rdev, *rrdev;
+        int disk_index;
+        int data_count = 0;
+
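+        /* count data blocks flagged R5_Wantwrite, excluding the P/Q slots */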
+        for (disk_index = 0; disk_index < sh->disks; disk_index++) {
+                if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
+                        continue;
+                if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
+                        continue;
+                data_count++;
+        }
+
+        /*
+         * stripes that only have parity must have been flushed
+         * before the crash that we are now recovering from, so
+         * there is nothing more to recover.
+         */
+        if (data_count == 0)
+                goto out;
+
+        for (disk_index = 0; disk_index < sh->disks; disk_index++) {
+                if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
+                        continue;
+
+                /* in case device is broken */
+                rcu_read_lock();
+                rdev = rcu_dereference(conf->disks[disk_index].rdev);
+                if (rdev) {
+                        atomic_inc(&rdev->nr_pending);
+                        rcu_read_unlock();
+                        sync_page_io(rdev, sh->sector, PAGE_SIZE,
+                                        sh->dev[disk_index].page, REQ_OP_WRITE, 0,
+                                        false);
+                        rdev_dec_pending(rdev, rdev->mddev);
+                        rcu_read_lock();
+                }
+                rrdev = rcu_dereference(conf->disks[disk_index].replacement);
+                if (rrdev) {
+                        atomic_inc(&rrdev->nr_pending);
+                        rcu_read_unlock();
+                        sync_page_io(rrdev, sh->sector, PAGE_SIZE,
+                                        sh->dev[disk_index].page, REQ_OP_WRITE, 0,
+                                        false);
+                        rdev_dec_pending(rrdev, rrdev->mddev);
+                        rcu_read_lock();
+                }
+                rcu_read_unlock();
+        }
+        ctx->data_parity_stripes++;
+out:
+        r5l_recovery_reset_stripe(sh);
+}
+
+static struct stripe_head *
+r5c_recovery_alloc_stripe(struct r5conf *conf,
+                struct list_head *recovery_list,
+                sector_t stripe_sect,
+                sector_t log_start)
+{
+        struct stripe_head *sh;
+
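+        /*
+         * The fourth argument requests a non-blocking lookup: when no free
+         * stripe is available this returns NULL instead of waiting.
+         */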
+        sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0);
+        if (!sh)
+                return NULL; /* no more stripe available */
+
+        r5l_recovery_reset_stripe(sh);
+        sh->log_start = log_start;
+
+        return sh;
+}
+
+static struct stripe_head *
+r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect)
+{
+        struct stripe_head *sh;
+
+        list_for_each_entry(sh, list, lru)
+                if (sh->sector == sect)
+                        return sh;
+        return NULL;
+}
+
+static void
+r5c_recovery_drop_stripes(struct list_head *cached_stripe_list,
+                struct r5l_recovery_ctx *ctx)
+{
+        struct stripe_head *sh, *next;
+
+        list_for_each_entry_safe(sh, next, cached_stripe_list, lru) {
+                r5l_recovery_reset_stripe(sh);
+                list_del_init(&sh->lru);
+                raid5_release_stripe(sh);
+        }
+}
+
+static void
+r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
+                struct r5l_recovery_ctx *ctx)
+{
+        struct stripe_head *sh, *next;
+
+        list_for_each_entry_safe(sh, next, cached_stripe_list, lru)
+                if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
+                        r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx);
+                        list_del_init(&sh->lru);
+                        raid5_release_stripe(sh);
+                }
+}
+
+/* if matches return 0; otherwise return -EINVAL */
+static int
+r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page,
+                sector_t log_offset, __le32 log_checksum)
+{
+        void *addr;
+        u32 checksum;
+
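+        /*
+         * Read the logged page back from the journal device and checksum it
+         * with the same crc32c seed (log->uuid_checksum) used on the write
+         * path.
+         */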
+        sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+                        page, REQ_OP_READ, 0, false);
+        addr = kmap_atomic(page);
+        checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
+        kunmap_atomic(addr);
+        return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
+}
+
+/*
+ * Before loading data to the stripe cache, we need to verify the checksum of
+ * all data. If there is a mismatch for any data page, we drop all data in
+ * the meta block.
+ */
+static int
+r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
+                struct r5l_recovery_ctx *ctx)
+{
+        struct mddev *mddev = log->rdev->mddev;
+        struct r5conf *conf = mddev->private;
+        struct r5l_meta_block *mb = page_address(ctx->meta_page);
+        sector_t mb_offset = sizeof(struct r5l_meta_block);
+        sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
+        struct page *page;
+        struct r5l_payload_data_parity *payload;
+
+        page = alloc_page(GFP_KERNEL);
+        if (!page)
+                return -ENOMEM;
+
+        while (mb_offset < le32_to_cpu(mb->meta_size)) {
+                payload = (void *)mb + mb_offset;
+
+                if (payload->header.type == R5LOG_PAYLOAD_DATA) {
+                        if (r5l_recovery_verify_data_checksum(
+                                        log, page, log_offset,
+                                        payload->checksum[0]) < 0)
+                                goto mismatch;
+                } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) {
+                        if (r5l_recovery_verify_data_checksum(
+                                        log, page, log_offset,
+                                        payload->checksum[0]) < 0)
+                                goto mismatch;
+                        if (conf->max_degraded == 2 && /* q for RAID 6 */
+                            r5l_recovery_verify_data_checksum(
+                                        log, page,
+                                        r5l_ring_add(log, log_offset,
+                                                        BLOCK_SECTORS),
+                                        payload->checksum[1]) < 0)
+                                goto mismatch;
+                } else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */
+                        goto mismatch;
+
+                log_offset = r5l_ring_add(log, log_offset,
+                                le32_to_cpu(payload->size));
+
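+                /*
+                 * payload->size is in 512B sectors and one __le32 checksum is
+                 * stored per page, so a payload occupies its header plus
+                 * (size >> (PAGE_SHIFT - 9)) checksum entries. For example,
+                 * with 4KB pages a one-page data payload has size == 8, which
+                 * is exactly one checksum entry.
+                 */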
+                mb_offset += sizeof(struct r5l_payload_data_parity) +
+                                sizeof(__le32) *
+                                (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
+        }
+
+        put_page(page);
+        return 0;
+
+mismatch:
+        put_page(page);
+        return -EINVAL;
+}
+
+/*
+ * Analyze all data/parity pages in one meta block
+ * Returns:
+ * 0 for success
+ * -EINVAL for unknown payload type
+ * -EAGAIN for checksum mismatch of data page
+ * -ENOMEM for running out of memory (alloc_page failed or no stripe left)
+ */
+static int
+r5c_recovery_analyze_meta_block(struct r5l_log *log,
+                struct r5l_recovery_ctx *ctx,
+                struct list_head *cached_stripe_list)
+{
+        struct mddev *mddev = log->rdev->mddev;
+        struct r5conf *conf = mddev->private;
+        struct r5l_meta_block *mb;
+        struct r5l_payload_data_parity *payload;
+        int mb_offset;
+        sector_t log_offset;
+        sector_t stripe_sect;
+        struct stripe_head *sh;
+        int ret;
+
+        /*
+         * On a mismatch in data blocks we drop all data in this mb, but we
+         * still read the next mb for other data with the FLUSH flag, as
+         * io_units could finish out of order.
+         */
+        ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
+        if (ret == -EINVAL)
+                return -EAGAIN;
+        else if (ret)
+                return ret; /* -ENOMEM due to alloc_page() failure */
+
+        mb = page_address(ctx->meta_page);
+        mb_offset = sizeof(struct r5l_meta_block);
+        log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
+
+        while (mb_offset < le32_to_cpu(mb->meta_size)) {
+                int dd;
+
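+                /*
+                 * A data payload records a logical array sector, which must
+                 * be mapped to the start sector of its stripe; a parity
+                 * payload already records the stripe sector.
+                 */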
+                payload = (void *)mb + mb_offset;
+                stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ?
+                        raid5_compute_sector(
+                                conf, le64_to_cpu(payload->location), 0, &dd,
+                                NULL)
+                        : le64_to_cpu(payload->location);
+
+                sh = r5c_recovery_lookup_stripe(cached_stripe_list,
+                                stripe_sect);
+
+                if (!sh) {
+                        sh = r5c_recovery_alloc_stripe(conf, cached_stripe_list,
+                                        stripe_sect, ctx->pos);
+                        /*
+                         * Cannot get a stripe from raid5_get_active_stripe;
+                         * try to replay some stripes first.
+                         */
+                        if (!sh) {
+                                r5c_recovery_replay_stripes(
+                                                cached_stripe_list, ctx);
+                                sh = r5c_recovery_alloc_stripe(
+                                                conf, cached_stripe_list,
+                                                stripe_sect, ctx->pos);
+                        }
+                        if (!sh) {
+                                pr_debug("md/raid:%s: Increasing stripe cache size to %d to recover data on journal.\n",
+                                                mdname(mddev),
+                                                conf->min_nr_stripes * 2);
+                                raid5_set_cache_size(mddev,
+                                                conf->min_nr_stripes * 2);
+                                sh = r5c_recovery_alloc_stripe(
+                                                conf, cached_stripe_list, stripe_sect,
+                                                ctx->pos);
+                        }
+                        if (!sh) {
+                                pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
+                                                mdname(mddev));
+                                return -ENOMEM;
+                        }
+                        list_add_tail(&sh->lru, cached_stripe_list);
+                }
+
+                if (payload->header.type == R5LOG_PAYLOAD_DATA) {
+                        if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
+                                r5l_recovery_replay_one_stripe(conf, sh, ctx);
+                                r5l_recovery_reset_stripe(sh);
+                                sh->log_start = ctx->pos;
+                                list_move_tail(&sh->lru, cached_stripe_list);
+                        }
+                        r5l_recovery_load_data(log, sh, ctx, payload,
+                                        log_offset);
+                } else if (payload->header.type == R5LOG_PAYLOAD_PARITY)
+                        r5l_recovery_load_parity(log, sh, ctx, payload,
+                                        log_offset);
+                else
+                        return -EINVAL;
+
+                log_offset = r5l_ring_add(log, log_offset,
+                                le32_to_cpu(payload->size));
+
+                mb_offset += sizeof(struct r5l_payload_data_parity) +
+                                sizeof(__le32) *
+                                (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
+        }
+
+        return 0;
+}
+
+/*
+ * Load the stripe into cache. The stripe will be written out later by
+ * the stripe cache state machine.
+ */
+static void r5c_recovery_load_one_stripe(struct r5l_log *log,
+                struct stripe_head *sh)
+{
+        struct r5conf *conf = sh->raid_conf;
+        struct r5dev *dev;
+        int i;
+
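+        /* turn the recovery marker R5_Wantwrite into R5_InJournal + R5_UPTODATE */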
+        for (i = sh->disks; i--; ) {
+                dev = sh->dev + i;
+                if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
+                        set_bit(R5_InJournal, &dev->flags);
+                        set_bit(R5_UPTODATE, &dev->flags);
+                }
+        }
+        set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
+        atomic_inc(&conf->r5c_cached_partial_stripes);
+        list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
+}
+
+/*
+ * Scan through the log for all to-be-flushed data.
+ *
+ * For stripes with data and parity, namely Data-Parity stripes
+ * (STRIPE_R5C_CACHING == 0), we simply replay all the writes.
+ *
+ * For stripes with only data, namely Data-Only stripes
+ * (STRIPE_R5C_CACHING == 1), we load them into the stripe cache state machine.
+ *
+ * For a stripe, if we see data after parity, we should discard all previous
+ * data and parity for this stripe, as these data are already flushed to
+ * the array.
+ *
+ * At the end of the scan, we return the new journal_tail, which points to the
+ * first data-only stripe on the journal device, or to the next invalid meta
+ * block.
+ */
+static int r5c_recovery_flush_log(struct r5l_log *log,
+                struct r5l_recovery_ctx *ctx)
+{
+        struct stripe_head *sh, *next;
+        int ret = 0;
+
+        /* scan through the log */
+        while (1) {
+                if (r5l_recovery_read_meta_block(log, ctx))
+                        break;
+
+                ret = r5c_recovery_analyze_meta_block(log, ctx,
+                                &ctx->cached_list);
+                /*
+                 * -EAGAIN means a mismatch in a data block; in this case we
+                 * still try to scan the next meta block.
+                 */
+                if (ret && ret != -EAGAIN)
+                        break; /* ret == -EINVAL or -ENOMEM */
+                ctx->seq++;
+                ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
+        }
+
+        if (ret == -ENOMEM) {
+                r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
+                return ret;
+        }
+
+        /* replay data-parity stripes */
+        r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
+
+        /* load data-only stripes to stripe cache */
+        list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
+                WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
+                r5c_recovery_load_one_stripe(log, sh);
+                list_del_init(&sh->lru);
+                raid5_release_stripe(sh);
+                ctx->data_only_stripes++;
+        }
+
+        return 0;
+}
+
+/*
+ * We did a recovery. Now ctx.pos points to an invalid meta block. The new
+ * log will start there, but we can't let the superblock point to the last
+ * valid meta block. The log might look like:
+ * | meta 1| meta 2| meta 3|
+ * meta 1 is valid, meta 2 is invalid, and meta 3 could be valid. If the
+ * superblock points to meta 1 and we write a new valid meta 2n at the
+ * position of meta 2, then after another crash the new recovery will start
+ * from meta 1. Since meta 2n is valid now, it would treat meta 3 as valid,
+ * which is wrong.
+ * The solution is to create a new meta block at the position of meta 2 with
+ * its seq == meta 1's seq + 10 and to let the superblock point to it. The
+ * next recovery will then not treat meta 3 as a valid meta block, because
+ * its seq doesn't match.
+ */
+
+/*
+ * Before recovery, the log looks like the following
+ *
+ *   ---------------------------------------------
+ *   |           valid log        | invalid log  |
+ *   ---------------------------------------------
+ *   ^
+ *   |- log->last_checkpoint
+ *   |- log->last_cp_seq
+ *
+ * Now we scan through the log until we see invalid entry
+ *
+ *   ---------------------------------------------
+ *   |           valid log        | invalid log  |
+ *   ---------------------------------------------
+ *   ^                            ^
+ *   |- log->last_checkpoint      |- ctx->pos
+ *   |- log->last_cp_seq          |- ctx->seq
+ *
+ * From this point, we need to increase seq number by 10 to avoid
+ * confusing next recovery.
+ *
+ *   ---------------------------------------------
+ *   |           valid log        | invalid log  |
+ *   ---------------------------------------------
+ *   ^                            ^
+ *   |- log->last_checkpoint      |- ctx->pos+1
+ *   |- log->last_cp_seq          |- ctx->seq+11
+ *
+ * However, it is not safe to start the state machine yet, because the
+ * data-only stripes are not yet secured in the RAID array. To secure these
+ * data-only stripes, we rewrite them to the journal starting from seq+11.
+ *
+ *   -----------------------------------------------------------------
+ *   |           valid log        | data only stripes | invalid log  |
+ *   -----------------------------------------------------------------
+ *   ^                                                ^
+ *   |- log->last_checkpoint                          |- ctx->pos+n
+ *   |- log->last_cp_seq                              |- ctx->seq+10+n
+ *
+ * If a failure happens again during this process, the recovery can safely
+ * start again from log->last_checkpoint.
+ *
+ * Once the data-only stripes are rewritten to the journal, we move log_tail
+ *
+ *   -----------------------------------------------------------------
+ *   |     old log        |    data only stripes    | invalid log    |
+ *   -----------------------------------------------------------------
+ *                        ^                         ^
+ *                        |- log->last_checkpoint   |- ctx->pos+n
+ *                        |- log->last_cp_seq       |- ctx->seq+10+n
+ *
+ * Then we can safely start the state machine. If a failure happens from this
+ * point on, the recovery will start from the new log->last_checkpoint.
+ */
+static int
+r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
+                struct r5l_recovery_ctx *ctx)
+{
+        struct stripe_head *sh;
+        struct mddev *mddev = log->rdev->mddev;
+        struct page *page;
+
+        page = alloc_page(GFP_KERNEL);
+        if (!page) {
+                pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
+                                mdname(mddev));
+                return -ENOMEM;
+        }
+
+        ctx->seq += 10;
+        list_for_each_entry(sh, &ctx->cached_list, lru) {
+                struct r5l_meta_block *mb;
+                int i;
+                int offset;
+                sector_t write_pos;
+
+                WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
+                r5l_recovery_create_empty_meta_block(log, page,
+                                ctx->pos, ctx->seq);
+                mb = page_address(page);
+                offset = le32_to_cpu(mb->meta_size);
+                write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
+
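+                /*
+                 * Append one data payload per R5_InJournal block of this
+                 * stripe; the data pages are written right behind the new
+                 * meta block at write_pos.
+                 */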
+                for (i = sh->disks; i--; ) {
+                        struct r5dev *dev = &sh->dev[i];
+                        struct r5l_payload_data_parity *payload;
+                        void *addr;
+
+                        if (test_bit(R5_InJournal, &dev->flags)) {
+                                payload = (void *)mb + offset;
+                                payload->header.type = cpu_to_le16(
+                                                R5LOG_PAYLOAD_DATA);
+                                payload->size = cpu_to_le32(BLOCK_SECTORS);
+                                payload->location = cpu_to_le64(
+                                                raid5_compute_blocknr(sh, i, 0));
+                                addr = kmap_atomic(dev->page);
+                                payload->checksum[0] = cpu_to_le32(
+                                                crc32c_le(log->uuid_checksum, addr,
+                                                                PAGE_SIZE));
+                                kunmap_atomic(addr);
+                                sync_page_io(log->rdev, write_pos, PAGE_SIZE,
+                                                dev->page, REQ_OP_WRITE, 0, false);
+                                write_pos = r5l_ring_add(log, write_pos,
+                                                BLOCK_SECTORS);
+                                offset += sizeof(__le32) +
+                                                sizeof(struct r5l_payload_data_parity);
+                        }
+                }
+                mb->meta_size = cpu_to_le32(offset);
+                mb->checksum = cpu_to_le32(
+                                crc32c_le(log->uuid_checksum, mb, PAGE_SIZE));
+                sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
+                                REQ_OP_WRITE, WRITE_FUA, false);
+                sh->log_start = ctx->pos;
+                ctx->pos = write_pos;
+                ctx->seq += 1;
+        }
+        __free_page(page);
+        return 0;
+}
+
 static int r5l_recovery_log(struct r5l_log *log)
 {
         struct r5l_recovery_ctx ctx;
@@ -1583,6 +2171,10 @@ static int r5l_recovery_log(struct r5l_log *log)
         ctx.pos = log->last_checkpoint;
         ctx.seq = log->last_cp_seq;
         ctx.meta_page = alloc_page(GFP_KERNEL);
+        ctx.data_only_stripes = 0;
+        ctx.data_parity_stripes = 0;
+        INIT_LIST_HEAD(&ctx.cached_list);
+
         if (!ctx.meta_page)
                 return -ENOMEM;

@@ -1617,6 +2209,16 @@ static int r5l_recovery_log(struct r5l_log *log)
                 log->log_start = ctx.pos;
                 log->seq = ctx.seq;
         }
+
+        /*
+         * This is to suppress "function defined but not used" warning.
+         * It will be removed when the two functions are used (next patch).
+         */
+        if (!log) {
+                r5c_recovery_flush_log(log, &ctx);
+                r5c_recovery_rewrite_data_only_stripes(log, &ctx);
+        }
+
         return 0;
 }