|
@@ -24,6 +24,11 @@
|
|
|
*/
|
|
|
#define MIN_FREE_RESHAPE_SPACE to_sector(4*4096)
|
|
|
|
|
|
+/*
|
|
|
+ * Minimum journal space 4 MiB in sectors.
|
|
|
+ */
|
|
|
+#define MIN_RAID456_JOURNAL_SPACE (4*2048)
|
|
|
+
|
|
|
static bool devices_handle_discard_safely = false;
|
|
|
|
|
|
/*
|
|
@@ -73,6 +78,9 @@ struct raid_dev {
|
|
|
#define __CTR_FLAG_DATA_OFFSET 13 /* 2 */ /* Only with reshapable raid4/5/6/10! */
|
|
|
#define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */
|
|
|
|
|
|
+/* New for v1.10.0 */
|
|
|
+#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6! */
|
|
|
+
|
|
|
/*
|
|
|
* Flags for rs->ctr_flags field.
|
|
|
*/
|
|
@@ -91,6 +99,7 @@ struct raid_dev {
|
|
|
#define CTR_FLAG_DELTA_DISKS (1 << __CTR_FLAG_DELTA_DISKS)
|
|
|
#define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET)
|
|
|
#define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS)
|
|
|
+#define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV)
|
|
|
|
|
|
/*
|
|
|
* Definitions of various constructor flags to
|
|
@@ -163,7 +172,8 @@ struct raid_dev {
|
|
|
CTR_FLAG_STRIPE_CACHE | \
|
|
|
CTR_FLAG_REGION_SIZE | \
|
|
|
CTR_FLAG_DELTA_DISKS | \
|
|
|
- CTR_FLAG_DATA_OFFSET)
|
|
|
+ CTR_FLAG_DATA_OFFSET | \
|
|
|
+ CTR_FLAG_JOURNAL_DEV)
|
|
|
|
|
|
#define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \
|
|
|
CTR_FLAG_REBUILD | \
|
|
@@ -173,7 +183,8 @@ struct raid_dev {
|
|
|
CTR_FLAG_STRIPE_CACHE | \
|
|
|
CTR_FLAG_REGION_SIZE | \
|
|
|
CTR_FLAG_DELTA_DISKS | \
|
|
|
- CTR_FLAG_DATA_OFFSET)
|
|
|
+ CTR_FLAG_DATA_OFFSET | \
|
|
|
+ CTR_FLAG_JOURNAL_DEV)
|
|
|
/* ...valid options definitions per raid level */
|
|
|
|
|
|
/*
|
|
@@ -222,6 +233,12 @@ struct raid_set {
|
|
|
struct raid_type *raid_type;
|
|
|
struct dm_target_callbacks callbacks;
|
|
|
|
|
|
+ /* Optional raid4/5/6 journal device */
|
|
|
+ struct journal_dev {
|
|
|
+ struct dm_dev *dev;
|
|
|
+ struct md_rdev rdev;
|
|
|
+ } journal_dev;
|
|
|
+
|
|
|
struct raid_dev dev[0];
|
|
|
};
|
|
|
|
|
@@ -306,6 +323,7 @@ static struct arg_name_flag {
|
|
|
{ CTR_FLAG_DATA_OFFSET, "data_offset"},
|
|
|
{ CTR_FLAG_DELTA_DISKS, "delta_disks"},
|
|
|
{ CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"},
|
|
|
+ { CTR_FLAG_JOURNAL_DEV, "journal_dev" },
|
|
|
};
|
|
|
|
|
|
/* Return argument name string for given @flag */
|
|
@@ -370,7 +388,7 @@ static bool rs_is_reshapable(struct raid_set *rs)
|
|
|
/* Return true, if raid set in @rs is recovering */
|
|
|
static bool rs_is_recovering(struct raid_set *rs)
|
|
|
{
|
|
|
- return rs->md.recovery_cp < rs->dev[0].rdev.sectors;
|
|
|
+ return rs->md.recovery_cp < rs->md.dev_sectors;
|
|
|
}
|
|
|
|
|
|
/* Return true, if raid set in @rs is reshaping */
|
|
@@ -627,7 +645,8 @@ static void rs_set_capacity(struct raid_set *rs)
|
|
|
* is unintended in case of out-of-place reshaping
|
|
|
*/
|
|
|
rdev_for_each(rdev, mddev)
|
|
|
- rdev->sectors = mddev->dev_sectors;
|
|
|
+ if (!test_bit(Journal, &rdev->flags))
|
|
|
+ rdev->sectors = mddev->dev_sectors;
|
|
|
|
|
|
set_capacity(gendisk, mddev->array_sectors);
|
|
|
revalidate_disk(gendisk);
|
|
@@ -713,6 +732,11 @@ static void raid_set_free(struct raid_set *rs)
|
|
|
{
|
|
|
int i;
|
|
|
|
|
|
+ if (rs->journal_dev.dev) {
|
|
|
+ md_rdev_clear(&rs->journal_dev.rdev);
|
|
|
+ dm_put_device(rs->ti, rs->journal_dev.dev);
|
|
|
+ }
|
|
|
+
|
|
|
for (i = 0; i < rs->raid_disks; i++) {
|
|
|
if (rs->dev[i].meta_dev)
|
|
|
dm_put_device(rs->ti, rs->dev[i].meta_dev);
|
|
@@ -760,10 +784,11 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
|
|
|
rs->dev[i].data_dev = NULL;
|
|
|
|
|
|
/*
|
|
|
- * There are no offsets, since there is a separate device
|
|
|
- * for data and metadata.
|
|
|
+ * There are no offsets initially.
|
|
|
+ * Out of place reshape will set them accordingly.
|
|
|
*/
|
|
|
rs->dev[i].rdev.data_offset = 0;
|
|
|
+ rs->dev[i].rdev.new_data_offset = 0;
|
|
|
rs->dev[i].rdev.mddev = &rs->md;
|
|
|
|
|
|
arg = dm_shift_arg(as);
|
|
@@ -821,6 +846,9 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
|
|
|
rebuild++;
|
|
|
}
|
|
|
|
|
|
+ if (rs->journal_dev.dev)
|
|
|
+ list_add_tail(&rs->journal_dev.rdev.same_set, &rs->md.disks);
|
|
|
+
|
|
|
if (metadata_available) {
|
|
|
rs->md.external = 0;
|
|
|
rs->md.persistent = 1;
|
|
@@ -1026,6 +1054,8 @@ too_many:
|
|
|
* [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
|
|
|
* [stripe_cache <sectors>] Stripe cache size for higher RAIDs
|
|
|
* [region_size <sectors>] Defines granularity of bitmap
|
|
|
+ * [journal_dev <dev>] raid4/5/6 journaling device
|
|
|
+ * (i.e. write hole closing log)
|
|
|
*
|
|
|
* RAID10-only options:
|
|
|
* [raid10_copies <# copies>] Number of copies. (Default: 2)
|
|
@@ -1133,7 +1163,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
|
|
|
/*
|
|
|
* Parameters that take a string value are checked here.
|
|
|
*/
|
|
|
-
|
|
|
+ /* "raid10_format {near|offset|far}" */
|
|
|
if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT))) {
|
|
|
if (test_and_set_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) {
|
|
|
rs->ti->error = "Only one 'raid10_format' argument pair allowed";
|
|
@@ -1151,6 +1181,41 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
|
|
|
continue;
|
|
|
}
|
|
|
|
|
|
+ /* "journal_dev dev" */
|
|
|
+ if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) {
|
|
|
+ int r;
|
|
|
+ struct md_rdev *jdev;
|
|
|
+
|
|
|
+ if (test_and_set_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
|
|
|
+ rs->ti->error = "Only one raid4/5/6 set journaling device allowed";
|
|
|
+ return -EINVAL;
|
|
|
+ }
|
|
|
+ if (!rt_is_raid456(rt)) {
|
|
|
+ rs->ti->error = "'journal_dev' is an invalid parameter for this RAID type";
|
|
|
+ return -EINVAL;
|
|
|
+ }
|
|
|
+ r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table),
|
|
|
+ &rs->journal_dev.dev);
|
|
|
+ if (r) {
|
|
|
+ rs->ti->error = "raid4/5/6 journal device lookup failure";
|
|
|
+ return r;
|
|
|
+ }
|
|
|
+ jdev = &rs->journal_dev.rdev;
|
|
|
+ md_rdev_init(jdev);
|
|
|
+ jdev->mddev = &rs->md;
|
|
|
+ jdev->bdev = rs->journal_dev.dev->bdev;
|
|
|
+ jdev->sectors = to_sector(i_size_read(jdev->bdev->bd_inode));
|
|
|
+ if (jdev->sectors < MIN_RAID456_JOURNAL_SPACE) {
|
|
|
+ rs->ti->error = "No space for raid4/5/6 journal";
|
|
|
+ return -ENOSPC;
|
|
|
+ }
|
|
|
+ set_bit(Journal, &jdev->flags);
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Parameters with number values from here on.
|
|
|
+ */
|
|
|
if (kstrtoint(arg, 10, &value) < 0) {
|
|
|
rs->ti->error = "Bad numerical argument given in raid params";
|
|
|
return -EINVAL;
|
|
@@ -1425,6 +1490,25 @@ static unsigned int rs_data_stripes(struct raid_set *rs)
|
|
|
return rs->raid_disks - rs->raid_type->parity_devs;
|
|
|
}
|
|
|
|
|
|
+/*
|
|
|
+ * Return rdev->sectors of the first raid device in @rs having a block device and
|
|
|
+ * a non-zero size, so that userspace may pass in arbitrary "- -" device tuples.
|
|
|
+ */
|
|
|
+static sector_t __rdev_sectors(struct raid_set *rs)
|
|
|
+{
|
|
|
+ int i;
|
|
|
+
|
|
|
+ for (i = 0; i < rs->md.raid_disks; i++) {
|
|
|
+ struct md_rdev *rdev = &rs->dev[i].rdev;
|
|
|
+
|
|
|
+ if (!test_bit(Journal, &rdev->flags) &&
|
|
|
+ rdev->bdev && rdev->sectors)
|
|
|
+ return rdev->sectors;
|
|
|
+ }
|
|
|
+
|
|
|
+ BUG(); /* Constructor ensures we got some. */
|
|
|
+}
|
|
|
+
|
|
|
/* Calculate the sectors per device and per array used for @rs */
|
|
|
static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
|
|
|
{
|
|
@@ -1468,7 +1552,8 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
|
|
|
array_sectors = (data_stripes + delta_disks) * dev_sectors;
|
|
|
|
|
|
rdev_for_each(rdev, mddev)
|
|
|
- rdev->sectors = dev_sectors;
|
|
|
+ if (!test_bit(Journal, &rdev->flags))
|
|
|
+ rdev->sectors = dev_sectors;
|
|
|
|
|
|
mddev->array_sectors = array_sectors;
|
|
|
mddev->dev_sectors = dev_sectors;
|
|
@@ -1510,9 +1595,9 @@ static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
|
|
|
else if (dev_sectors == MaxSector)
|
|
|
/* Prevent recovery */
|
|
|
__rs_setup_recovery(rs, MaxSector);
|
|
|
- else if (rs->dev[0].rdev.sectors < dev_sectors)
|
|
|
+ else if (__rdev_sectors(rs) < dev_sectors)
|
|
|
/* Grown raid set */
|
|
|
- __rs_setup_recovery(rs, rs->dev[0].rdev.sectors);
|
|
|
+ __rs_setup_recovery(rs, __rdev_sectors(rs));
|
|
|
else
|
|
|
__rs_setup_recovery(rs, MaxSector);
|
|
|
}
|
|
@@ -1851,18 +1936,21 @@ static int rs_check_reshape(struct raid_set *rs)
|
|
|
return -EPERM;
|
|
|
}
|
|
|
|
|
|
-static int read_disk_sb(struct md_rdev *rdev, int size)
|
|
|
+static int read_disk_sb(struct md_rdev *rdev, int size, bool force_reload)
|
|
|
{
|
|
|
BUG_ON(!rdev->sb_page);
|
|
|
|
|
|
- if (rdev->sb_loaded)
|
|
|
+ if (rdev->sb_loaded && !force_reload)
|
|
|
return 0;
|
|
|
|
|
|
+ rdev->sb_loaded = 0;
|
|
|
+
|
|
|
if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) {
|
|
|
DMERR("Failed to read superblock of device at position %d",
|
|
|
rdev->raid_disk);
|
|
|
md_error(rdev->mddev, rdev);
|
|
|
- return -EINVAL;
|
|
|
+ set_bit(Faulty, &rdev->flags);
|
|
|
+ return -EIO;
|
|
|
}
|
|
|
|
|
|
rdev->sb_loaded = 1;
|
|
@@ -1990,7 +2078,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
|
|
|
return -EINVAL;
|
|
|
}
|
|
|
|
|
|
- r = read_disk_sb(rdev, rdev->sb_size);
|
|
|
+ r = read_disk_sb(rdev, rdev->sb_size, false);
|
|
|
if (r)
|
|
|
return r;
|
|
|
|
|
@@ -2146,6 +2234,9 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
|
|
|
*/
|
|
|
d = 0;
|
|
|
rdev_for_each(r, mddev) {
|
|
|
+ if (test_bit(Journal, &r->flags))
|
|
|
+ continue; /* test the loop cursor @r: the journal device is not a raid set component */
|
|
|
+
|
|
|
if (test_bit(FirstUse, &r->flags))
|
|
|
new_devs++;
|
|
|
|
|
@@ -2201,7 +2292,8 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
|
|
|
*/
|
|
|
sb_retrieve_failed_devices(sb, failed_devices);
|
|
|
rdev_for_each(r, mddev) {
|
|
|
- if (!r->sb_page)
|
|
|
+ if (test_bit(Journal, &r->flags) ||
|
|
|
+ !r->sb_page) /* skip the journal device and devices without a superblock page */
|
|
|
continue;
|
|
|
sb2 = page_address(r->sb_page);
|
|
|
sb2->failed_devices = 0;
|
|
@@ -2253,7 +2345,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
|
|
|
struct mddev *mddev = &rs->md;
|
|
|
struct dm_raid_superblock *sb;
|
|
|
|
|
|
- if (rs_is_raid0(rs) || !rdev->sb_page)
|
|
|
+ if (rs_is_raid0(rs) || !rdev->sb_page || rdev->raid_disk < 0)
|
|
|
return 0;
|
|
|
|
|
|
sb = page_address(rdev->sb_page);
|
|
@@ -2278,7 +2370,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
|
|
|
|
|
|
/* Enable bitmap creation for RAID levels != 0 */
|
|
|
mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096);
|
|
|
- rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
|
|
|
+ mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
|
|
|
|
|
|
if (!test_and_clear_bit(FirstUse, &rdev->flags)) {
|
|
|
/* Retrieve device size stored in superblock to be prepared for shrink */
|
|
@@ -2316,21 +2408,22 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
|
|
|
static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
|
|
|
{
|
|
|
int r;
|
|
|
- struct raid_dev *dev;
|
|
|
- struct md_rdev *rdev, *tmp, *freshest;
|
|
|
+ struct md_rdev *rdev, *freshest;
|
|
|
struct mddev *mddev = &rs->md;
|
|
|
|
|
|
freshest = NULL;
|
|
|
- rdev_for_each_safe(rdev, tmp, mddev) {
|
|
|
+ rdev_for_each(rdev, mddev) {
|
|
|
+ if (test_bit(Journal, &rdev->flags))
|
|
|
+ continue;
|
|
|
+
|
|
|
/*
|
|
|
* Skipping super_load due to CTR_FLAG_SYNC will cause
|
|
|
* the array to undergo initialization again as
|
|
|
* though it were new. This is the intended effect
|
|
|
* of the "sync" directive.
|
|
|
*
|
|
|
- * When reshaping capability is added, we must ensure
|
|
|
- * that the "sync" directive is disallowed during the
|
|
|
- * reshape.
|
|
|
+ * With reshaping capability added, we must ensure that
|
|
|
+ * the "sync" directive is disallowed during the reshape.
|
|
|
*/
|
|
|
if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
|
|
|
continue;
|
|
@@ -2347,6 +2440,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
|
|
|
case 0:
|
|
|
break;
|
|
|
default:
|
|
|
+ /* This is a failure to read the superblock from the metadata device. */
|
|
|
/*
|
|
|
* We have to keep any raid0 data/metadata device pairs or
|
|
|
* the MD raid0 personality will fail to start the array.
|
|
@@ -2354,33 +2448,16 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
|
|
|
if (rs_is_raid0(rs))
|
|
|
continue;
|
|
|
|
|
|
- dev = container_of(rdev, struct raid_dev, rdev);
|
|
|
- if (dev->meta_dev)
|
|
|
- dm_put_device(ti, dev->meta_dev);
|
|
|
-
|
|
|
- dev->meta_dev = NULL;
|
|
|
- rdev->meta_bdev = NULL;
|
|
|
-
|
|
|
- if (rdev->sb_page)
|
|
|
- put_page(rdev->sb_page);
|
|
|
-
|
|
|
- rdev->sb_page = NULL;
|
|
|
-
|
|
|
- rdev->sb_loaded = 0;
|
|
|
-
|
|
|
/*
|
|
|
- * We might be able to salvage the data device
|
|
|
- * even though the meta device has failed. For
|
|
|
- * now, we behave as though '- -' had been
|
|
|
- * set for this device in the table.
|
|
|
+ * We keep the dm_devs to be able to emit the device tuple
|
|
|
+ * properly on the table line in raid_status() (rather than
|
|
|
+ * mistakenly acting as if '- -' got passed into the constructor).
|
|
|
+ *
|
|
|
+ * The rdev has to stay on the same_set list to allow for
|
|
|
+ * the attempt to restore faulty devices on second resume.
|
|
|
*/
|
|
|
- if (dev->data_dev)
|
|
|
- dm_put_device(ti, dev->data_dev);
|
|
|
-
|
|
|
- dev->data_dev = NULL;
|
|
|
- rdev->bdev = NULL;
|
|
|
-
|
|
|
- list_del(&rdev->same_set);
|
|
|
+ rdev->raid_disk = rdev->saved_raid_disk = -1;
|
|
|
+ break;
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -2401,7 +2478,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
|
|
|
return -EINVAL;
|
|
|
|
|
|
rdev_for_each(rdev, mddev)
|
|
|
- if ((rdev != freshest) && super_validate(rs, rdev))
|
|
|
+ if (!test_bit(Journal, &rdev->flags) &&
|
|
|
+ rdev != freshest &&
|
|
|
+ super_validate(rs, rdev))
|
|
|
return -EINVAL;
|
|
|
return 0;
|
|
|
}
|
|
@@ -2488,10 +2567,12 @@ static int rs_adjust_data_offsets(struct raid_set *rs)
|
|
|
return -ENOSPC;
|
|
|
}
|
|
|
out:
|
|
|
- /* Adjust data offsets on all rdevs */
|
|
|
+ /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */
|
|
|
rdev_for_each(rdev, &rs->md) {
|
|
|
- rdev->data_offset = data_offset;
|
|
|
- rdev->new_data_offset = new_data_offset;
|
|
|
+ if (!test_bit(Journal, &rdev->flags)) {
|
|
|
+ rdev->data_offset = data_offset;
|
|
|
+ rdev->new_data_offset = new_data_offset;
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
return 0;
|
|
@@ -2504,8 +2585,10 @@ static void __reorder_raid_disk_indexes(struct raid_set *rs)
|
|
|
struct md_rdev *rdev;
|
|
|
|
|
|
rdev_for_each(rdev, &rs->md) {
|
|
|
- rdev->raid_disk = i++;
|
|
|
- rdev->saved_raid_disk = rdev->new_raid_disk = -1;
|
|
|
+ if (!test_bit(Journal, &rdev->flags)) {
|
|
|
+ rdev->raid_disk = i++;
|
|
|
+ rdev->saved_raid_disk = rdev->new_raid_disk = -1;
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -2845,7 +2928,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
|
|
|
if (r)
|
|
|
goto bad;
|
|
|
|
|
|
- calculated_dev_sectors = rs->dev[0].rdev.sectors;
|
|
|
+ calculated_dev_sectors = rs->md.dev_sectors;
|
|
|
|
|
|
/*
|
|
|
* Backup any new raid set level, layout, ...
|
|
@@ -2858,7 +2941,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
|
|
|
if (r)
|
|
|
goto bad;
|
|
|
|
|
|
- resize = calculated_dev_sectors != rs->dev[0].rdev.sectors;
|
|
|
+ resize = calculated_dev_sectors != __rdev_sectors(rs);
|
|
|
|
|
|
INIT_WORK(&rs->md.event_work, do_table_event);
|
|
|
ti->private = rs;
|
|
@@ -2902,6 +2985,13 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
|
|
|
goto bad;
|
|
|
}
|
|
|
|
|
|
+ /* We can't takeover a journaled raid4/5/6 */
|
|
|
+ if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
|
|
|
+ ti->error = "Can't takeover a journaled raid4/5/6 set";
|
|
|
+ r = -EPERM;
|
|
|
+ goto bad;
|
|
|
+ }
|
|
|
+
|
|
|
/*
|
|
|
* If a takeover is needed, userspace sets any additional
|
|
|
* devices to rebuild and we can check for a valid request here.
|
|
@@ -2923,6 +3013,18 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
|
|
|
rs_setup_recovery(rs, MaxSector);
|
|
|
rs_set_new(rs);
|
|
|
} else if (rs_reshape_requested(rs)) {
|
|
|
+ /*
|
|
|
+ * No need to check for 'ongoing' takeover here, because takeover
|
|
|
+ * is an instant operation as opposed to an ongoing reshape.
|
|
|
+ */
|
|
|
+
|
|
|
+ /* We can't reshape a journaled raid4/5/6 */
|
|
|
+ if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
|
|
|
+ ti->error = "Can't reshape a journaled raid4/5/6 set";
|
|
|
+ r = -EPERM;
|
|
|
+ goto bad;
|
|
|
+ }
|
|
|
+
|
|
|
/*
|
|
|
* We can only prepare for a reshape here, because the
|
|
|
* raid set needs to run to provide the repective reshape
|
|
@@ -3071,18 +3173,23 @@ static const char *decipher_sync_action(struct mddev *mddev)
|
|
|
}
|
|
|
|
|
|
/*
|
|
|
- * Return status string @rdev
|
|
|
+ * Return status string for @rdev
|
|
|
*
|
|
|
* Status characters:
|
|
|
*
|
|
|
- * 'D' = Dead/Failed device
|
|
|
+ * 'D' = Dead/Failed raid set component or raid4/5/6 journal device
|
|
|
* 'a' = Alive but not in-sync
|
|
|
- * 'A' = Alive and in-sync
|
|
|
+ * 'A' = Alive and in-sync raid set component or alive raid4/5/6 journal device
|
|
|
+ * '-' = Non-existing device (i.e. userspace passed '- -' into the ctr)
|
|
|
*/
|
|
|
static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync)
|
|
|
{
|
|
|
- if (test_bit(Faulty, &rdev->flags))
|
|
|
+ if (!rdev->bdev)
|
|
|
+ return "-";
|
|
|
+ else if (test_bit(Faulty, &rdev->flags))
|
|
|
return "D";
|
|
|
+ else if (test_bit(Journal, &rdev->flags))
|
|
|
+ return "A";
|
|
|
else if (!array_in_sync || !test_bit(In_sync, &rdev->flags))
|
|
|
return "a";
|
|
|
else
|
|
@@ -3151,7 +3258,8 @@ static sector_t rs_get_progress(struct raid_set *rs,
|
|
|
* being initialized.
|
|
|
*/
|
|
|
rdev_for_each(rdev, mddev)
|
|
|
- if (!test_bit(In_sync, &rdev->flags))
|
|
|
+ if (!test_bit(Journal, &rdev->flags) &&
|
|
|
+ !test_bit(In_sync, &rdev->flags))
|
|
|
*array_in_sync = true;
|
|
|
#if 0
|
|
|
r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */
|
|
@@ -3183,7 +3291,6 @@ static void raid_status(struct dm_target *ti, status_type_t type,
|
|
|
sector_t progress, resync_max_sectors, resync_mismatches;
|
|
|
const char *sync_action;
|
|
|
struct raid_type *rt;
|
|
|
- struct md_rdev *rdev;
|
|
|
|
|
|
switch (type) {
|
|
|
case STATUSTYPE_INFO:
|
|
@@ -3204,9 +3311,9 @@ static void raid_status(struct dm_target *ti, status_type_t type,
|
|
|
atomic64_read(&mddev->resync_mismatches) : 0;
|
|
|
sync_action = decipher_sync_action(&rs->md);
|
|
|
|
|
|
- /* HM FIXME: do we want another state char for raid0? It shows 'D' or 'A' now */
|
|
|
- rdev_for_each(rdev, mddev)
|
|
|
- DMEMIT(__raid_dev_status(rdev, array_in_sync));
|
|
|
+ /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
|
|
|
+ for (i = 0; i < rs->raid_disks; i++)
|
|
|
+ DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync));
|
|
|
|
|
|
/*
|
|
|
* In-sync/Reshape ratio:
|
|
@@ -3252,6 +3359,12 @@ static void raid_status(struct dm_target *ti, status_type_t type,
|
|
|
* so retrieving it from the first raid disk is sufficient.
|
|
|
*/
|
|
|
DMEMIT(" %llu", (unsigned long long) rs->dev[0].rdev.data_offset);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * v1.10.0+:
|
|
|
+ */
|
|
|
+ DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ?
|
|
|
+ __raid_dev_status(&rs->journal_dev.rdev, 0) : "-");
|
|
|
break;
|
|
|
|
|
|
case STATUSTYPE_TABLE:
|
|
@@ -3265,7 +3378,8 @@ static void raid_status(struct dm_target *ti, status_type_t type,
|
|
|
raid_param_cnt += rebuild_disks * 2 +
|
|
|
write_mostly_params +
|
|
|
hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
|
|
|
- hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2;
|
|
|
+ hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 +
|
|
|
+ (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0);
|
|
|
/* Emit table line */
|
|
|
DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
|
|
|
if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
|
|
@@ -3312,6 +3426,9 @@ static void raid_status(struct dm_target *ti, status_type_t type,
|
|
|
if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
|
|
|
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
|
|
|
mddev->sync_speed_min);
|
|
|
+ if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags))
|
|
|
+ DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV),
|
|
|
+ __get_dev_name(rs->journal_dev.dev));
|
|
|
DMEMIT(" %d", rs->raid_disks);
|
|
|
for (i = 0; i < rs->raid_disks; i++)
|
|
|
DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev),
|
|
@@ -3347,10 +3464,11 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv)
|
|
|
else {
|
|
|
if (!strcasecmp(argv[0], "check"))
|
|
|
set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
|
|
|
- else if (!!strcasecmp(argv[0], "repair"))
|
|
|
+ else if (!strcasecmp(argv[0], "repair")) {
|
|
|
+ set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
|
|
|
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
|
|
|
+ } else
|
|
|
return -EINVAL;
|
|
|
- set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
|
|
|
- set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
|
|
|
}
|
|
|
if (mddev->ro == 2) {
|
|
|
/* A write to sync_action is enough to justify
|
|
@@ -3427,11 +3545,14 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
|
|
|
|
|
|
memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices));
|
|
|
|
|
|
- for (i = 0; i < rs->md.raid_disks; i++) {
|
|
|
+ for (i = 0; i < mddev->raid_disks; i++) {
|
|
|
r = &rs->dev[i].rdev;
|
|
|
- if (test_bit(Faulty, &r->flags) && r->sb_page &&
|
|
|
- sync_page_io(r, 0, r->sb_size, r->sb_page,
|
|
|
- REQ_OP_READ, 0, true)) {
|
|
|
+ /* HM FIXME: enhance journal device recovery processing */
|
|
|
+ if (test_bit(Journal, &r->flags))
|
|
|
+ continue;
|
|
|
+
|
|
|
+ if (test_bit(Faulty, &r->flags) &&
|
|
|
+ r->meta_bdev && !read_disk_sb(r, r->sb_size, true)) {
|
|
|
DMINFO("Faulty %s device #%d has readable super block."
|
|
|
" Attempting to revive it.",
|
|
|
rs->raid_type->name, i);
|
|
@@ -3445,22 +3566,26 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
|
|
|
* '>= 0' - meaning we must call this function
|
|
|
* ourselves.
|
|
|
*/
|
|
|
- if ((r->raid_disk >= 0) &&
|
|
|
- (mddev->pers->hot_remove_disk(mddev, r) != 0))
|
|
|
- /* Failed to revive this device, try next */
|
|
|
- continue;
|
|
|
-
|
|
|
- r->raid_disk = i;
|
|
|
- r->saved_raid_disk = i;
|
|
|
flags = r->flags;
|
|
|
+ clear_bit(In_sync, &r->flags); /* Mandatory for hot remove. */
|
|
|
+ if (r->raid_disk >= 0) {
|
|
|
+ if (mddev->pers->hot_remove_disk(mddev, r)) {
|
|
|
+ /* Failed to revive this device, try next */
|
|
|
+ r->flags = flags;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ } else
|
|
|
+ r->raid_disk = r->saved_raid_disk = i;
|
|
|
+
|
|
|
clear_bit(Faulty, &r->flags);
|
|
|
clear_bit(WriteErrorSeen, &r->flags);
|
|
|
- clear_bit(In_sync, &r->flags);
|
|
|
+
|
|
|
if (mddev->pers->hot_add_disk(mddev, r)) {
|
|
|
- r->raid_disk = -1;
|
|
|
- r->saved_raid_disk = -1;
|
|
|
+ /* Failed to revive this device, try next */
|
|
|
+ r->raid_disk = r->saved_raid_disk = -1;
|
|
|
r->flags = flags;
|
|
|
} else {
|
|
|
+ clear_bit(In_sync, &r->flags);
|
|
|
r->recovery_offset = 0;
|
|
|
set_bit(i, (void *) cleared_failed_devices);
|
|
|
cleared = true;
|
|
@@ -3473,6 +3598,9 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
|
|
|
uint64_t failed_devices[DISKS_ARRAY_ELEMS];
|
|
|
|
|
|
rdev_for_each(r, &rs->md) {
|
|
|
+ if (test_bit(Journal, &r->flags))
|
|
|
+ continue;
|
|
|
+
|
|
|
sb = page_address(r->sb_page);
|
|
|
sb_retrieve_failed_devices(sb, failed_devices);
|
|
|
|
|
@@ -3651,7 +3779,7 @@ static void raid_resume(struct dm_target *ti)
|
|
|
|
|
|
static struct target_type raid_target = {
|
|
|
.name = "raid",
|
|
|
- .version = {1, 9, 1},
|
|
|
+ .version = {1, 10, 0},
|
|
|
.module = THIS_MODULE,
|
|
|
.ctr = raid_ctr,
|
|
|
.dtr = raid_dtr,
|