@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2015 Shaohua Li <shli@fb.com>
+ * Copyright (C) 2016 Song Liu <songliubraving@fb.com>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -1354,6 +1355,9 @@ struct r5l_recovery_ctx {
         sector_t meta_total_blocks;     /* total size of current meta and data */
         sector_t pos;                   /* recovery position */
         u64 seq;                        /* recovery position seq */
+        int data_parity_stripes;        /* number of data_parity stripes */
+        int data_only_stripes;          /* number of data_only stripes */
+        struct list_head cached_list;
 };

 static int r5l_recovery_read_meta_block(struct r5l_log *log,
@@ -1576,6 +1580,590 @@ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
         return 0;
 }

+/*
+ * r5l_recovery_load_data and r5l_recovery_load_parity use flag R5_Wantwrite
+ * to mark valid (potentially not flushed) data in the journal.
+ *
+ * We already verified the checksum in r5l_recovery_verify_data_checksum_for_mb,
+ * so there should not be any mismatch here.
+ */
+static void r5l_recovery_load_data(struct r5l_log *log,
+                struct stripe_head *sh,
+                struct r5l_recovery_ctx *ctx,
+                struct r5l_payload_data_parity *payload,
+                sector_t log_offset)
+{
+        struct mddev *mddev = log->rdev->mddev;
+        struct r5conf *conf = mddev->private;
+        int dd_idx;
+
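+        /*
+         * Map the array sector in payload->location to the member-disk slot
+         * (dd_idx) of this stripe, then read the logged page from the
+         * journal device into that slot of the stripe cache.
+         */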
+        raid5_compute_sector(conf,
+                        le64_to_cpu(payload->location), 0,
+                        &dd_idx, sh);
+        sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+                        sh->dev[dd_idx].page, REQ_OP_READ, 0, false);
+        sh->dev[dd_idx].log_checksum =
+                        le32_to_cpu(payload->checksum[0]);
+        ctx->meta_total_blocks += BLOCK_SECTORS;
+
+        set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags);
+        set_bit(STRIPE_R5C_CACHING, &sh->state);
+}
+
+static void r5l_recovery_load_parity(struct r5l_log *log,
+                struct stripe_head *sh,
+                struct r5l_recovery_ctx *ctx,
+                struct r5l_payload_data_parity *payload,
+                sector_t log_offset)
+{
+        struct mddev *mddev = log->rdev->mddev;
+        struct r5conf *conf = mddev->private;
+
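+        /* a parity payload carries P, plus Q when max_degraded == 2 (RAID6) */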
+        ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
+        sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+                        sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false);
+        sh->dev[sh->pd_idx].log_checksum =
+                        le32_to_cpu(payload->checksum[0]);
+        set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
+
+        if (sh->qd_idx >= 0) {
+                sync_page_io(log->rdev,
+                                r5l_ring_add(log, log_offset, BLOCK_SECTORS),
+                                PAGE_SIZE, sh->dev[sh->qd_idx].page,
+                                REQ_OP_READ, 0, false);
+                sh->dev[sh->qd_idx].log_checksum =
+                                le32_to_cpu(payload->checksum[1]);
+                set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
+        }
+        clear_bit(STRIPE_R5C_CACHING, &sh->state);
+}
+
+static void r5l_recovery_reset_stripe(struct stripe_head *sh)
+{
+        int i;
+
+        sh->state = 0;
+        sh->log_start = MaxSector;
+        for (i = sh->disks; i--; )
+                sh->dev[i].flags = 0;
+}
+
+static void
+r5l_recovery_replay_one_stripe(struct r5conf *conf,
+                struct stripe_head *sh,
+                struct r5l_recovery_ctx *ctx)
+{
+        struct md_rdev *rdev, *rrdev;
+        int disk_index;
+        int data_count = 0;
+
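+        /* count data blocks flagged R5_Wantwrite, excluding the P/Q slots */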
+        for (disk_index = 0; disk_index < sh->disks; disk_index++) {
+                if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
+                        continue;
+                if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
+                        continue;
+                data_count++;
+        }
+
+        /*
+         * stripes that only have parity must have been flushed
+         * before the crash that we are now recovering from, so
+         * there is nothing more to recover.
+         */
+        if (data_count == 0)
+                goto out;
+
+        for (disk_index = 0; disk_index < sh->disks; disk_index++) {
+                if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
+                        continue;
+
+                /* in case device is broken */
+                rcu_read_lock();
+                rdev = rcu_dereference(conf->disks[disk_index].rdev);
+                if (rdev) {
+                        atomic_inc(&rdev->nr_pending);
+                        rcu_read_unlock();
+                        sync_page_io(rdev, sh->sector, PAGE_SIZE,
+                                        sh->dev[disk_index].page, REQ_OP_WRITE, 0,
+                                        false);
+                        rdev_dec_pending(rdev, rdev->mddev);
+                        rcu_read_lock();
+                }
+                rrdev = rcu_dereference(conf->disks[disk_index].replacement);
+                if (rrdev) {
+                        atomic_inc(&rrdev->nr_pending);
+                        rcu_read_unlock();
+                        sync_page_io(rrdev, sh->sector, PAGE_SIZE,
+                                        sh->dev[disk_index].page, REQ_OP_WRITE, 0,
+                                        false);
+                        rdev_dec_pending(rrdev, rrdev->mddev);
+                        rcu_read_lock();
+                }
+                rcu_read_unlock();
+        }
+        ctx->data_parity_stripes++;
+out:
+        r5l_recovery_reset_stripe(sh);
+}
+
+static struct stripe_head *
+r5c_recovery_alloc_stripe(struct r5conf *conf,
+                struct list_head *recovery_list,
+                sector_t stripe_sect,
+                sector_t log_start)
+{
+        struct stripe_head *sh;
+
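+        /*
+         * The fourth argument requests a non-blocking lookup: when no free
+         * stripe is available this returns NULL instead of waiting.
+         */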
+        sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0);
+        if (!sh)
+                return NULL; /* no more stripe available */
+
+        r5l_recovery_reset_stripe(sh);
+        sh->log_start = log_start;
+
+        return sh;
+}
+
+static struct stripe_head *
+r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect)
+{
+        struct stripe_head *sh;
+
+        list_for_each_entry(sh, list, lru)
+                if (sh->sector == sect)
+                        return sh;
+        return NULL;
+}
+
+static void
+r5c_recovery_drop_stripes(struct list_head *cached_stripe_list,
+                struct r5l_recovery_ctx *ctx)
+{
+        struct stripe_head *sh, *next;
+
+        list_for_each_entry_safe(sh, next, cached_stripe_list, lru) {
+                r5l_recovery_reset_stripe(sh);
+                list_del_init(&sh->lru);
+                raid5_release_stripe(sh);
+        }
+}
+
+static void
+r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
+                struct r5l_recovery_ctx *ctx)
+{
+        struct stripe_head *sh, *next;
+
+        list_for_each_entry_safe(sh, next, cached_stripe_list, lru)
+                if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
+                        r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx);
+                        list_del_init(&sh->lru);
+                        raid5_release_stripe(sh);
+                }
+}
+
+/* if matches return 0; otherwise return -EINVAL */
+static int
+r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page,
+                sector_t log_offset, __le32 log_checksum)
+{
+        void *addr;
+        u32 checksum;
+
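+        /*
+         * Read the logged page back from the journal device and checksum it
+         * with the same crc32c seed (log->uuid_checksum) used on the write
+         * path.
+         */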
+        sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+                        page, REQ_OP_READ, 0, false);
+        addr = kmap_atomic(page);
+        checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
+        kunmap_atomic(addr);
+        return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
+}
+
+/*
+ * Before loading data to the stripe cache, we need to verify the checksum of
+ * all data. If there is a mismatch for any data page, we drop all data in
+ * the meta block.
+ */
+static int
+r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
+                struct r5l_recovery_ctx *ctx)
+{
+        struct mddev *mddev = log->rdev->mddev;
+        struct r5conf *conf = mddev->private;
+        struct r5l_meta_block *mb = page_address(ctx->meta_page);
+        sector_t mb_offset = sizeof(struct r5l_meta_block);
+        sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
+        struct page *page;
+        struct r5l_payload_data_parity *payload;
+
+        page = alloc_page(GFP_KERNEL);
+        if (!page)
+                return -ENOMEM;
+
+        while (mb_offset < le32_to_cpu(mb->meta_size)) {
+                payload = (void *)mb + mb_offset;
+
+                if (payload->header.type == R5LOG_PAYLOAD_DATA) {
+                        if (r5l_recovery_verify_data_checksum(
+                                        log, page, log_offset,
+                                        payload->checksum[0]) < 0)
+                                goto mismatch;
+                } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) {
+                        if (r5l_recovery_verify_data_checksum(
+                                        log, page, log_offset,
+                                        payload->checksum[0]) < 0)
+                                goto mismatch;
+                        if (conf->max_degraded == 2 && /* q for RAID 6 */
+                            r5l_recovery_verify_data_checksum(
+                                        log, page,
+                                        r5l_ring_add(log, log_offset,
+                                                        BLOCK_SECTORS),
+                                        payload->checksum[1]) < 0)
+                                goto mismatch;
+                } else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */
+                        goto mismatch;
+
+                log_offset = r5l_ring_add(log, log_offset,
+                                le32_to_cpu(payload->size));
+
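+                /*
+                 * payload->size is in 512B sectors and one __le32 checksum is
+                 * stored per page, so a payload occupies its header plus
+                 * (size >> (PAGE_SHIFT - 9)) checksum entries. For example,
+                 * with 4KB pages a one-page data payload has size == 8, which
+                 * is exactly one checksum entry.
+                 */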
+                mb_offset += sizeof(struct r5l_payload_data_parity) +
+                                sizeof(__le32) *
+                                (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
+        }
+
+        put_page(page);
+        return 0;
+
+mismatch:
+        put_page(page);
+        return -EINVAL;
+}
+
+/*
+ * Analyze all data/parity pages in one meta block
+ * Returns:
+ * 0 for success
+ * -EINVAL for unknown payload type
+ * -EAGAIN for checksum mismatch of data page
+ * -ENOMEM for running out of memory (alloc_page failed or no stripe left)
+ */
+static int
+r5c_recovery_analyze_meta_block(struct r5l_log *log,
+                struct r5l_recovery_ctx *ctx,
+                struct list_head *cached_stripe_list)
+{
+        struct mddev *mddev = log->rdev->mddev;
+        struct r5conf *conf = mddev->private;
+        struct r5l_meta_block *mb;
+        struct r5l_payload_data_parity *payload;
+        int mb_offset;
+        sector_t log_offset;
+        sector_t stripe_sect;
+        struct stripe_head *sh;
+        int ret;
+
+        /*
+         * On a mismatch in data blocks we drop all data in this mb, but we
+         * still read the next mb for other data with the FLUSH flag, as
+         * io_units could finish out of order.
+         */
+        ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
+        if (ret == -EINVAL)
+                return -EAGAIN;
+        else if (ret)
+                return ret; /* -ENOMEM due to alloc_page() failure */
+
+        mb = page_address(ctx->meta_page);
+        mb_offset = sizeof(struct r5l_meta_block);
+        log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
+
+        while (mb_offset < le32_to_cpu(mb->meta_size)) {
+                int dd;
+
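+                /*
+                 * A data payload records a logical array sector, which must
+                 * be mapped to the start sector of its stripe; a parity
+                 * payload already records the stripe sector.
+                 */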
+                payload = (void *)mb + mb_offset;
+                stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ?
+                        raid5_compute_sector(
+                                conf, le64_to_cpu(payload->location), 0, &dd,
+                                NULL)
+                        : le64_to_cpu(payload->location);
+
+                sh = r5c_recovery_lookup_stripe(cached_stripe_list,
+                                stripe_sect);
+
+                if (!sh) {
+                        sh = r5c_recovery_alloc_stripe(conf, cached_stripe_list,
+                                        stripe_sect, ctx->pos);
+                        /*
+                         * Cannot get a stripe from raid5_get_active_stripe;
+                         * try to replay some stripes first.
+                         */
+                        if (!sh) {
+                                r5c_recovery_replay_stripes(
+                                                cached_stripe_list, ctx);
+                                sh = r5c_recovery_alloc_stripe(
+                                                conf, cached_stripe_list,
+                                                stripe_sect, ctx->pos);
+                        }
+                        if (!sh) {
+                                pr_debug("md/raid:%s: Increasing stripe cache size to %d to recover data on journal.\n",
+                                                mdname(mddev),
+                                                conf->min_nr_stripes * 2);
+                                raid5_set_cache_size(mddev,
+                                                conf->min_nr_stripes * 2);
+                                sh = r5c_recovery_alloc_stripe(
+                                                conf, cached_stripe_list, stripe_sect,
+                                                ctx->pos);
+                        }
+                        if (!sh) {
+                                pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
+                                                mdname(mddev));
+                                return -ENOMEM;
+                        }
+                        list_add_tail(&sh->lru, cached_stripe_list);
+                }
+
+                if (payload->header.type == R5LOG_PAYLOAD_DATA) {
+                        if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
+                                r5l_recovery_replay_one_stripe(conf, sh, ctx);
+                                r5l_recovery_reset_stripe(sh);
+                                sh->log_start = ctx->pos;
+                                list_move_tail(&sh->lru, cached_stripe_list);
+                        }
+                        r5l_recovery_load_data(log, sh, ctx, payload,
+                                        log_offset);
+                } else if (payload->header.type == R5LOG_PAYLOAD_PARITY)
+                        r5l_recovery_load_parity(log, sh, ctx, payload,
+                                        log_offset);
+                else
+                        return -EINVAL;
+
+                log_offset = r5l_ring_add(log, log_offset,
+                                le32_to_cpu(payload->size));
+
+                mb_offset += sizeof(struct r5l_payload_data_parity) +
+                                sizeof(__le32) *
+                                (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
+        }
+
+        return 0;
+}
+
+/*
+ * Load the stripe into cache. The stripe will be written out later by
+ * the stripe cache state machine.
+ */
+static void r5c_recovery_load_one_stripe(struct r5l_log *log,
+                struct stripe_head *sh)
+{
+        struct r5conf *conf = sh->raid_conf;
+        struct r5dev *dev;
+        int i;
+
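+        /* turn the recovery marker R5_Wantwrite into R5_InJournal + R5_UPTODATE */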
+        for (i = sh->disks; i--; ) {
+                dev = sh->dev + i;
+                if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
+                        set_bit(R5_InJournal, &dev->flags);
+                        set_bit(R5_UPTODATE, &dev->flags);
+                }
+        }
+        set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
+        atomic_inc(&conf->r5c_cached_partial_stripes);
+        list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
+}
+
+/*
+ * Scan through the log for all to-be-flushed data.
+ *
+ * For stripes with data and parity, namely Data-Parity stripes
+ * (STRIPE_R5C_CACHING == 0), we simply replay all the writes.
+ *
+ * For stripes with only data, namely Data-Only stripes
+ * (STRIPE_R5C_CACHING == 1), we load them into the stripe cache state machine.
+ *
+ * For a stripe, if we see data after parity, we should discard all previous
+ * data and parity for this stripe, as these data are already flushed to
+ * the array.
+ *
+ * At the end of the scan, we return the new journal_tail, which points to the
+ * first data-only stripe on the journal device, or to the next invalid meta
+ * block.
+ */
+static int r5c_recovery_flush_log(struct r5l_log *log,
+                struct r5l_recovery_ctx *ctx)
+{
+        struct stripe_head *sh, *next;
+        int ret = 0;
+
+        /* scan through the log */
+        while (1) {
+                if (r5l_recovery_read_meta_block(log, ctx))
+                        break;
+
+                ret = r5c_recovery_analyze_meta_block(log, ctx,
+                                &ctx->cached_list);
+                /*
+                 * -EAGAIN means a mismatch in a data block; in this case we
+                 * still try to scan the next meta block.
+                 */
+                if (ret && ret != -EAGAIN)
+                        break; /* ret == -EINVAL or -ENOMEM */
+                ctx->seq++;
+                ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
+        }
+
+        if (ret == -ENOMEM) {
+                r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
+                return ret;
+        }
+
+        /* replay data-parity stripes */
+        r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
+
+        /* load data-only stripes to stripe cache */
+        list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
+                WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
+                r5c_recovery_load_one_stripe(log, sh);
+                list_del_init(&sh->lru);
+                raid5_release_stripe(sh);
+                ctx->data_only_stripes++;
+        }
+
+        return 0;
+}
+
+/*
+ * We did a recovery. Now ctx.pos points to an invalid meta block. The new
+ * log will start there, but we can't let the superblock point to the last
+ * valid meta block. The log might look like:
+ * | meta 1| meta 2| meta 3|
+ * meta 1 is valid, meta 2 is invalid, and meta 3 could be valid. If the
+ * superblock points to meta 1 and we write a new valid meta 2n at the
+ * position of meta 2, then after another crash the new recovery will start
+ * from meta 1. Since meta 2n is valid now, it would treat meta 3 as valid,
+ * which is wrong.
+ * The solution is to create a new meta block at the position of meta 2 with
+ * its seq == meta 1's seq + 10 and to let the superblock point to it. The
+ * next recovery will then not treat meta 3 as a valid meta block, because
+ * its seq doesn't match.
+ */
+
+/*
+ * Before recovery, the log looks like the following
+ *
+ *   ---------------------------------------------
+ *   |           valid log        | invalid log  |
+ *   ---------------------------------------------
+ *   ^
+ *   |- log->last_checkpoint
+ *   |- log->last_cp_seq
+ *
+ * Now we scan through the log until we see invalid entry
+ *
+ *   ---------------------------------------------
+ *   |           valid log        | invalid log  |
+ *   ---------------------------------------------
+ *   ^                            ^
+ *   |- log->last_checkpoint      |- ctx->pos
+ *   |- log->last_cp_seq          |- ctx->seq
+ *
+ * From this point, we need to increase seq number by 10 to avoid
+ * confusing next recovery.
+ *
+ *   ---------------------------------------------
+ *   |           valid log        | invalid log  |
+ *   ---------------------------------------------
+ *   ^                            ^
+ *   |- log->last_checkpoint      |- ctx->pos+1
+ *   |- log->last_cp_seq          |- ctx->seq+11
+ *
+ * However, it is not safe to start the state machine yet, because the
+ * data-only stripes are not yet secured in the RAID array. To secure these
+ * data-only stripes, we rewrite them to the journal starting from seq+11.
+ *
+ *   -----------------------------------------------------------------
+ *   |           valid log        | data only stripes | invalid log  |
+ *   -----------------------------------------------------------------
+ *   ^                                                ^
+ *   |- log->last_checkpoint                          |- ctx->pos+n
+ *   |- log->last_cp_seq                              |- ctx->seq+10+n
+ *
+ * If a failure happens again during this process, the recovery can safely
+ * start again from log->last_checkpoint.
+ *
+ * Once the data-only stripes are rewritten to the journal, we move log_tail
+ *
+ *   -----------------------------------------------------------------
+ *   |     old log        |    data only stripes    | invalid log    |
+ *   -----------------------------------------------------------------
+ *                        ^                         ^
+ *                        |- log->last_checkpoint   |- ctx->pos+n
+ *                        |- log->last_cp_seq       |- ctx->seq+10+n
+ *
+ * Then we can safely start the state machine. If a failure happens from this
+ * point on, the recovery will start from the new log->last_checkpoint.
+ */
+static int
+r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
+                struct r5l_recovery_ctx *ctx)
+{
+        struct stripe_head *sh;
+        struct mddev *mddev = log->rdev->mddev;
+        struct page *page;
+
+        page = alloc_page(GFP_KERNEL);
+        if (!page) {
+                pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
+                                mdname(mddev));
+                return -ENOMEM;
+        }
+
+        ctx->seq += 10;
+        list_for_each_entry(sh, &ctx->cached_list, lru) {
+                struct r5l_meta_block *mb;
+                int i;
+                int offset;
+                sector_t write_pos;
+
+                WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
+                r5l_recovery_create_empty_meta_block(log, page,
+                                ctx->pos, ctx->seq);
+                mb = page_address(page);
+                offset = le32_to_cpu(mb->meta_size);
+                write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
+
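+                /*
+                 * Append one data payload per R5_InJournal block of this
+                 * stripe; the data pages are written right behind the new
+                 * meta block at write_pos.
+                 */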
+                for (i = sh->disks; i--; ) {
+                        struct r5dev *dev = &sh->dev[i];
+                        struct r5l_payload_data_parity *payload;
+                        void *addr;
+
+                        if (test_bit(R5_InJournal, &dev->flags)) {
+                                payload = (void *)mb + offset;
+                                payload->header.type = cpu_to_le16(
+                                                R5LOG_PAYLOAD_DATA);
+                                payload->size = cpu_to_le32(BLOCK_SECTORS);
+                                payload->location = cpu_to_le64(
+                                                raid5_compute_blocknr(sh, i, 0));
+                                addr = kmap_atomic(dev->page);
+                                payload->checksum[0] = cpu_to_le32(
+                                                crc32c_le(log->uuid_checksum, addr,
+                                                                PAGE_SIZE));
+                                kunmap_atomic(addr);
+                                sync_page_io(log->rdev, write_pos, PAGE_SIZE,
+                                                dev->page, REQ_OP_WRITE, 0, false);
+                                write_pos = r5l_ring_add(log, write_pos,
+                                                BLOCK_SECTORS);
+                                offset += sizeof(__le32) +
+                                                sizeof(struct r5l_payload_data_parity);
+                        }
+                }
+                mb->meta_size = cpu_to_le32(offset);
+                mb->checksum = cpu_to_le32(
+                                crc32c_le(log->uuid_checksum, mb, PAGE_SIZE));
+                sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
+                                REQ_OP_WRITE, WRITE_FUA, false);
+                sh->log_start = ctx->pos;
+                ctx->pos = write_pos;
+                ctx->seq += 1;
+        }
+        __free_page(page);
+        return 0;
+}
+
 static int r5l_recovery_log(struct r5l_log *log)
 {
         struct r5l_recovery_ctx ctx;
@@ -1583,6 +2171,10 @@ static int r5l_recovery_log(struct r5l_log *log)
         ctx.pos = log->last_checkpoint;
         ctx.seq = log->last_cp_seq;
         ctx.meta_page = alloc_page(GFP_KERNEL);
+        ctx.data_only_stripes = 0;
+        ctx.data_parity_stripes = 0;
+        INIT_LIST_HEAD(&ctx.cached_list);
+
         if (!ctx.meta_page)
                 return -ENOMEM;

@@ -1617,6 +2209,16 @@ static int r5l_recovery_log(struct r5l_log *log)
                 log->log_start = ctx.pos;
                 log->seq = ctx.seq;
         }
+
+        /*
+         * This is to suppress "function defined but not used" warning.
+         * It will be removed when the two functions are used (next patch).
+         */
+        if (!log) {
+                r5c_recovery_flush_log(log, &ctx);
+                r5c_recovery_rewrite_data_only_stripes(log, &ctx);
+        }
+
         return 0;
 }