@@ -717,11 +717,248 @@ static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
 	md_wakeup_thread(log->reclaim_thread);
 }
 
+struct r5l_recovery_ctx {
+	struct page *meta_page;		/* current meta */
+	sector_t meta_total_blocks;	/* total size of current meta and data */
+	sector_t pos;			/* recovery position */
+	u64 seq;			/* recovery position seq */
+};
+
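+/*
+ * Read the meta block at ctx->pos into ctx->meta_page and validate it:
+ * magic, version, sequence number, position and CRC must all match
+ * before its payloads are trusted.
+ */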
+static int r5l_read_meta_block(struct r5l_log *log,
+			       struct r5l_recovery_ctx *ctx)
+{
+	struct page *page = ctx->meta_page;
+	struct r5l_meta_block *mb;
+	u32 crc, stored_crc;
+
+	if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false))
+		return -EIO;
+
+	mb = page_address(page);
+	stored_crc = le32_to_cpu(mb->checksum);
+	mb->checksum = 0;
+
+	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
+	    le64_to_cpu(mb->seq) != ctx->seq ||
+	    mb->version != R5LOG_VERSION ||
+	    le64_to_cpu(mb->position) != ctx->pos)
+		return -EINVAL;
+
+	crc = crc32_le(log->uuid_checksum, (void *)mb, PAGE_SIZE);
+	if (stored_crc != crc)
+		return -EINVAL;
+
+	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
+		return -EINVAL;
+
+	ctx->meta_total_blocks = BLOCK_SECTORS;
+
+	return 0;
+}
+
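+/*
+ * Replay one stripe from the log: read the logged data/parity pages into
+ * the stripe cache, verify their checksums, then write them back to the
+ * member disks (and any replacement devices).
+ */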
+static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
+					 struct r5l_recovery_ctx *ctx,
+					 sector_t stripe_sect,
+					 int *offset, sector_t *log_offset)
+{
+	struct r5conf *conf = log->rdev->mddev->private;
+	struct stripe_head *sh;
+	struct r5l_payload_data_parity *payload;
+	int disk_index;
+
+	sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
+	while (1) {
+		payload = page_address(ctx->meta_page) + *offset;
+
+		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
+			raid5_compute_sector(conf,
+					     le64_to_cpu(payload->location), 0,
+					     &disk_index, sh);
+
+			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
+				     sh->dev[disk_index].page, READ, false);
+			sh->dev[disk_index].log_checksum =
+				le32_to_cpu(payload->checksum[0]);
+			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
+			ctx->meta_total_blocks += BLOCK_SECTORS;
+		} else {
+			disk_index = sh->pd_idx;
+			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
+				     sh->dev[disk_index].page, READ, false);
+			sh->dev[disk_index].log_checksum =
+				le32_to_cpu(payload->checksum[0]);
+			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
+
+			if (sh->qd_idx >= 0) {
+				disk_index = sh->qd_idx;
+				sync_page_io(log->rdev,
+					     r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
+					     PAGE_SIZE, sh->dev[disk_index].page,
+					     READ, false);
+				sh->dev[disk_index].log_checksum =
+					le32_to_cpu(payload->checksum[1]);
+				set_bit(R5_Wantwrite,
+					&sh->dev[disk_index].flags);
+			}
+			ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
+		}
+
+		*log_offset = r5l_ring_add(log, *log_offset,
+					   le32_to_cpu(payload->size));
+		*offset += sizeof(struct r5l_payload_data_parity) +
+			sizeof(__le32) *
+			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
+		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
+			break;
+	}
+
+	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
+		void *addr;
+		u32 checksum;
+
+		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
+			continue;
+		addr = kmap_atomic(sh->dev[disk_index].page);
+		checksum = crc32_le(log->uuid_checksum, addr, PAGE_SIZE);
+		kunmap_atomic(addr);
+		if (checksum != sh->dev[disk_index].log_checksum)
+			goto error;
+	}
+
+	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
+		struct md_rdev *rdev, *rrdev;
+
+		if (!test_and_clear_bit(R5_Wantwrite,
+					&sh->dev[disk_index].flags))
+			continue;
+
+		/* in case device is broken */
+		rdev = rcu_dereference(conf->disks[disk_index].rdev);
+		if (rdev)
+			sync_page_io(rdev, stripe_sect, PAGE_SIZE,
+				     sh->dev[disk_index].page, WRITE, false);
+		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
+		if (rrdev)
+			sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
+				     sh->dev[disk_index].page, WRITE, false);
+	}
+	raid5_release_stripe(sh);
+	return 0;
+
+error:
+	for (disk_index = 0; disk_index < sh->disks; disk_index++)
+		sh->dev[disk_index].flags = 0;
+	raid5_release_stripe(sh);
+	return -EINVAL;
+}
+
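+/*
+ * Walk every payload described by the current meta block and flush the
+ * corresponding stripe back to the array.
+ */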
+static int r5l_recovery_flush_one_meta(struct r5l_log *log,
+				       struct r5l_recovery_ctx *ctx)
+{
+	struct r5conf *conf = log->rdev->mddev->private;
+	struct r5l_payload_data_parity *payload;
+	struct r5l_meta_block *mb;
+	int offset;
+	sector_t log_offset;
+	sector_t stripe_sector;
+
+	mb = page_address(ctx->meta_page);
+	offset = sizeof(struct r5l_meta_block);
+	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
+
+	while (offset < le32_to_cpu(mb->meta_size)) {
+		int dd;
+
+		payload = (void *)mb + offset;
+		stripe_sector = raid5_compute_sector(conf,
+				le64_to_cpu(payload->location), 0, &dd, NULL);
+		if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
+						  &offset, &log_offset))
+			return -EINVAL;
+	}
+	return 0;
+}
+
+/* copy data/parity from log to raid disks */
+static void r5l_recovery_flush_log(struct r5l_log *log,
+				   struct r5l_recovery_ctx *ctx)
+{
+	while (1) {
+		if (r5l_read_meta_block(log, ctx))
+			return;
+		if (r5l_recovery_flush_one_meta(log, ctx))
+			return;
+		ctx->seq++;
+		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
+	}
+}
+
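+/*
+ * Write a header-only meta block at @pos with sequence number @seq. It is
+ * used after recovery to mark where the new log starts.
+ */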
+static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
+					  u64 seq)
+{
+	struct page *page;
+	struct r5l_meta_block *mb;
+	u32 crc;
+
+	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	if (!page)
+		return -ENOMEM;
+	mb = page_address(page);
+	mb->magic = cpu_to_le32(R5LOG_MAGIC);
+	mb->version = R5LOG_VERSION;
+	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
+	mb->seq = cpu_to_le64(seq);
+	mb->position = cpu_to_le64(pos);
+	crc = crc32_le(log->uuid_checksum, (void *)mb, PAGE_SIZE);
+	mb->checksum = cpu_to_le32(crc);
+
+	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) {
+		__free_page(page);
+		return -EIO;
+	}
+	__free_page(page);
+	return 0;
+}
+
 static int r5l_recovery_log(struct r5l_log *log)
 {
-	/* fake recovery */
-	log->seq = log->last_cp_seq + 1;
-	log->log_start = r5l_ring_add(log, log->last_checkpoint, BLOCK_SECTORS);
+	struct r5l_recovery_ctx ctx;
+
+	ctx.pos = log->last_checkpoint;
+	ctx.seq = log->last_cp_seq;
+	ctx.meta_page = alloc_page(GFP_KERNEL);
+	if (!ctx.meta_page)
+		return -ENOMEM;
+
+	r5l_recovery_flush_log(log, &ctx);
+	__free_page(ctx.meta_page);
+
+	/*
+	 * We did a recovery. Now ctx.pos points to an invalid meta block. The
+	 * new log will start here, but we can't let the superblock point to
+	 * the last valid meta block. The log might look like:
+	 * | meta 1| meta 2| meta 3|
+	 * meta 1 is valid, meta 2 is invalid, meta 3 could be valid. If the
+	 * superblock points to meta 1 and we write a new valid meta 2n, then
+	 * if a crash happens again the new recovery will start from meta 1.
+	 * Since meta 2n is valid now, recovery will think meta 3 is valid too,
+	 * which is wrong.
+	 * The solution is to create a new meta block in meta 2's slot with its
+	 * seq == meta 1's seq + 10 and let the superblock point to it. The
+	 * same recovery will then not treat meta 3 as a valid meta block,
+	 * because its seq doesn't match.
+	 */
+	if (ctx.seq > log->last_cp_seq + 1) {
+		int ret;
+
+		ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
+		if (ret)
+			return ret;
+		log->seq = ctx.seq + 11;
+		log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
+		r5l_write_super(log, ctx.pos);
+	} else {
+		log->log_start = ctx.pos;
+		log->seq = ctx.seq;
+	}
 	return 0;
 }