|
@@ -47,7 +47,9 @@
|
|
|
#include <linux/kthread.h>
|
|
|
#include <linux/raid/pq.h>
|
|
|
#include <linux/async_tx.h>
|
|
|
+#include <linux/async.h>
|
|
|
#include <linux/seq_file.h>
|
|
|
+#include <linux/cpu.h>
|
|
|
#include "md.h"
|
|
|
#include "raid5.h"
|
|
|
#include "bitmap.h"
|
|
@@ -499,11 +501,18 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
|
|
|
struct page *bio_page;
|
|
|
int i;
|
|
|
int page_offset;
|
|
|
+ struct async_submit_ctl submit;
|
|
|
+ enum async_tx_flags flags = 0;
|
|
|
|
|
|
if (bio->bi_sector >= sector)
|
|
|
page_offset = (signed)(bio->bi_sector - sector) * 512;
|
|
|
else
|
|
|
page_offset = (signed)(sector - bio->bi_sector) * -512;
|
|
|
+
|
|
|
+ if (frombio)
|
|
|
+ flags |= ASYNC_TX_FENCE;
|
|
|
+ init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
|
|
|
+
|
|
|
bio_for_each_segment(bvl, bio, i) {
|
|
|
int len = bio_iovec_idx(bio, i)->bv_len;
|
|
|
int clen;
|
|
@@ -525,15 +534,14 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
|
|
|
bio_page = bio_iovec_idx(bio, i)->bv_page;
|
|
|
if (frombio)
|
|
|
tx = async_memcpy(page, bio_page, page_offset,
|
|
|
- b_offset, clen,
|
|
|
- ASYNC_TX_DEP_ACK,
|
|
|
- tx, NULL, NULL);
|
|
|
+ b_offset, clen, &submit);
|
|
|
else
|
|
|
tx = async_memcpy(bio_page, page, b_offset,
|
|
|
- page_offset, clen,
|
|
|
- ASYNC_TX_DEP_ACK,
|
|
|
- tx, NULL, NULL);
|
|
|
+ page_offset, clen, &submit);
|
|
|
}
|
|
|
+ /* chain the operations */
|
|
|
+ submit.depend_tx = tx;
|
|
|
+
|
|
|
if (clen < len) /* hit end of page */
|
|
|
break;
|
|
|
page_offset += len;
|
|
@@ -592,6 +600,7 @@ static void ops_run_biofill(struct stripe_head *sh)
|
|
|
{
|
|
|
struct dma_async_tx_descriptor *tx = NULL;
|
|
|
raid5_conf_t *conf = sh->raid_conf;
|
|
|
+ struct async_submit_ctl submit;
|
|
|
int i;
|
|
|
|
|
|
pr_debug("%s: stripe %llu\n", __func__,
|
|
@@ -615,22 +624,34 @@ static void ops_run_biofill(struct stripe_head *sh)
|
|
|
}
|
|
|
|
|
|
atomic_inc(&sh->count);
|
|
|
- async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
|
|
|
- ops_complete_biofill, sh);
|
|
|
+ init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
|
|
|
+ async_trigger_callback(&submit);
|
|
|
}
|
|
|
|
|
|
-static void ops_complete_compute5(void *stripe_head_ref)
|
|
|
+static void mark_target_uptodate(struct stripe_head *sh, int target)
|
|
|
{
|
|
|
- struct stripe_head *sh = stripe_head_ref;
|
|
|
- int target = sh->ops.target;
|
|
|
- struct r5dev *tgt = &sh->dev[target];
|
|
|
+ struct r5dev *tgt;
|
|
|
|
|
|
- pr_debug("%s: stripe %llu\n", __func__,
|
|
|
- (unsigned long long)sh->sector);
|
|
|
+ if (target < 0)
|
|
|
+ return;
|
|
|
|
|
|
+ tgt = &sh->dev[target];
|
|
|
set_bit(R5_UPTODATE, &tgt->flags);
|
|
|
BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
|
|
|
clear_bit(R5_Wantcompute, &tgt->flags);
|
|
|
+}
|
|
|
+
|
|
|
+static void ops_complete_compute(void *stripe_head_ref)
|
|
|
+{
|
|
|
+ struct stripe_head *sh = stripe_head_ref;
|
|
|
+
|
|
|
+ pr_debug("%s: stripe %llu\n", __func__,
|
|
|
+ (unsigned long long)sh->sector);
|
|
|
+
|
|
|
+ /* mark the computed target(s) as uptodate */
|
|
|
+ mark_target_uptodate(sh, sh->ops.target);
|
|
|
+ mark_target_uptodate(sh, sh->ops.target2);
|
|
|
+
|
|
|
clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
|
|
|
if (sh->check_state == check_state_compute_run)
|
|
|
sh->check_state = check_state_compute_result;
|
|
@@ -638,16 +659,24 @@ static void ops_complete_compute5(void *stripe_head_ref)
|
|
|
release_stripe(sh);
|
|
|
}
|
|
|
|
|
|
-static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
|
|
|
+/* return a pointer to the address conversion region of the scribble buffer */
|
|
|
+static addr_conv_t *to_addr_conv(struct stripe_head *sh,
|
|
|
+ struct raid5_percpu *percpu)
|
|
|
+{
|
|
|
+ return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
|
|
|
+}
|
|
|
+
|
|
|
+static struct dma_async_tx_descriptor *
|
|
|
+ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
|
|
|
{
|
|
|
- /* kernel stack size limits the total number of disks */
|
|
|
int disks = sh->disks;
|
|
|
- struct page *xor_srcs[disks];
|
|
|
+ struct page **xor_srcs = percpu->scribble;
|
|
|
int target = sh->ops.target;
|
|
|
struct r5dev *tgt = &sh->dev[target];
|
|
|
struct page *xor_dest = tgt->page;
|
|
|
int count = 0;
|
|
|
struct dma_async_tx_descriptor *tx;
|
|
|
+ struct async_submit_ctl submit;
|
|
|
int i;
|
|
|
|
|
|
pr_debug("%s: stripe %llu block: %d\n",
|
|
@@ -660,17 +689,215 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
|
|
|
|
|
|
atomic_inc(&sh->count);
|
|
|
|
|
|
+ init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
|
|
|
+ ops_complete_compute, sh, to_addr_conv(sh, percpu));
|
|
|
if (unlikely(count == 1))
|
|
|
- tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
|
|
|
- 0, NULL, ops_complete_compute5, sh);
|
|
|
+ tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
|
|
|
else
|
|
|
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
|
|
|
- ASYNC_TX_XOR_ZERO_DST, NULL,
|
|
|
- ops_complete_compute5, sh);
|
|
|
+ tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
|
|
|
|
|
|
return tx;
|
|
|
}
|
|
|
|
|
|
+/* set_syndrome_sources - populate source buffers for gen_syndrome
|
|
|
+ * @srcs - (struct page *) array with room for sh->disks + 2 entries
|
|
|
+ * @sh - stripe_head to parse
|
|
|
+ *
|
|
|
+ * Populates srcs in proper layout order for the stripe and returns the
|
|
|
+ * 'count' of sources to be used in a call to async_gen_syndrome. The P
|
|
|
+ * destination buffer is recorded in srcs[count] and the Q destination
|
|
|
+ * is recorded in srcs[count+1].
|
|
|
+ */
|
|
|
+static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
|
|
|
+{
|
|
|
+ int disks = sh->disks;
|
|
|
+ int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
|
|
|
+ int d0_idx = raid6_d0(sh);
|
|
|
+ int count;
|
|
|
+ int i;
|
|
|
+
|
|
|
+ for (i = 0; i < disks; i++)
|
|
|
+ srcs[i] = (void *)raid6_empty_zero_page;
|
|
|
+
|
|
|
+ count = 0;
|
|
|
+ i = d0_idx;
|
|
|
+ do {
|
|
|
+ int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
|
|
|
+
|
|
|
+ srcs[slot] = sh->dev[i].page;
|
|
|
+ i = raid6_next_disk(i, disks);
|
|
|
+ } while (i != d0_idx);
|
|
|
+ BUG_ON(count != syndrome_disks);
|
|
|
+
|
|
|
+ return count;
|
|
|
+}
|
|
|
+
|
|
|
+static struct dma_async_tx_descriptor *
|
|
|
+ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
|
|
|
+{
|
|
|
+ int disks = sh->disks;
|
|
|
+ struct page **blocks = percpu->scribble;
|
|
|
+ int target;
|
|
|
+ int qd_idx = sh->qd_idx;
|
|
|
+ struct dma_async_tx_descriptor *tx;
|
|
|
+ struct async_submit_ctl submit;
|
|
|
+ struct r5dev *tgt;
|
|
|
+ struct page *dest;
|
|
|
+ int i;
|
|
|
+ int count;
|
|
|
+
|
|
|
+ if (sh->ops.target < 0)
|
|
|
+ target = sh->ops.target2;
|
|
|
+ else if (sh->ops.target2 < 0)
|
|
|
+ target = sh->ops.target;
|
|
|
+ else
|
|
|
+ /* we should only have one valid target */
|
|
|
+ BUG();
|
|
|
+ BUG_ON(target < 0);
|
|
|
+ pr_debug("%s: stripe %llu block: %d\n",
|
|
|
+ __func__, (unsigned long long)sh->sector, target);
|
|
|
+
|
|
|
+ tgt = &sh->dev[target];
|
|
|
+ BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
|
|
|
+ dest = tgt->page;
|
|
|
+
|
|
|
+ atomic_inc(&sh->count);
|
|
|
+
|
|
|
+ if (target == qd_idx) {
|
|
|
+ count = set_syndrome_sources(blocks, sh);
|
|
|
+ blocks[count] = NULL; /* regenerating p is not necessary */
|
|
|
+ BUG_ON(blocks[count+1] != dest); /* q should already be set */
|
|
|
+ init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
|
|
|
+ ops_complete_compute, sh,
|
|
|
+ to_addr_conv(sh, percpu));
|
|
|
+ tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
|
|
|
+ } else {
|
|
|
+ /* Compute any data- or p-drive using XOR */
|
|
|
+ count = 0;
|
|
|
+ for (i = disks; i-- ; ) {
|
|
|
+ if (i == target || i == qd_idx)
|
|
|
+ continue;
|
|
|
+ blocks[count++] = sh->dev[i].page;
|
|
|
+ }
|
|
|
+
|
|
|
+ init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
|
|
|
+ NULL, ops_complete_compute, sh,
|
|
|
+ to_addr_conv(sh, percpu));
|
|
|
+ tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
|
|
|
+ }
|
|
|
+
|
|
|
+ return tx;
|
|
|
+}
|
|
|
+
|
|
|
+static struct dma_async_tx_descriptor *
|
|
|
+ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
|
|
|
+{
|
|
|
+ int i, count, disks = sh->disks;
|
|
|
+ int syndrome_disks = sh->ddf_layout ? disks : disks-2;
|
|
|
+ int d0_idx = raid6_d0(sh);
|
|
|
+ int faila = -1, failb = -1;
|
|
|
+ int target = sh->ops.target;
|
|
|
+ int target2 = sh->ops.target2;
|
|
|
+ struct r5dev *tgt = &sh->dev[target];
|
|
|
+ struct r5dev *tgt2 = &sh->dev[target2];
|
|
|
+ struct dma_async_tx_descriptor *tx;
|
|
|
+ struct page **blocks = percpu->scribble;
|
|
|
+ struct async_submit_ctl submit;
|
|
|
+
|
|
|
+ pr_debug("%s: stripe %llu block1: %d block2: %d\n",
|
|
|
+ __func__, (unsigned long long)sh->sector, target, target2);
|
|
|
+ BUG_ON(target < 0 || target2 < 0);
|
|
|
+ BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
|
|
|
+ BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
|
|
|
+
|
|
|
+ /* we need to open-code set_syndrome_sources to handle the
|
|
|
+ * slot number conversion for 'faila' and 'failb'
|
|
|
+ */
|
|
|
+ for (i = 0; i < disks ; i++)
|
|
|
+ blocks[i] = (void *)raid6_empty_zero_page;
|
|
|
+ count = 0;
|
|
|
+ i = d0_idx;
|
|
|
+ do {
|
|
|
+ int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
|
|
|
+
|
|
|
+ blocks[slot] = sh->dev[i].page;
|
|
|
+
|
|
|
+ if (i == target)
|
|
|
+ faila = slot;
|
|
|
+ if (i == target2)
|
|
|
+ failb = slot;
|
|
|
+ i = raid6_next_disk(i, disks);
|
|
|
+ } while (i != d0_idx);
|
|
|
+ BUG_ON(count != syndrome_disks);
|
|
|
+
|
|
|
+ BUG_ON(faila == failb);
|
|
|
+ if (failb < faila)
|
|
|
+ swap(faila, failb);
|
|
|
+ pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
|
|
|
+ __func__, (unsigned long long)sh->sector, faila, failb);
|
|
|
+
|
|
|
+ atomic_inc(&sh->count);
|
|
|
+
|
|
|
+ if (failb == syndrome_disks+1) {
|
|
|
+ /* Q disk is one of the missing disks */
|
|
|
+ if (faila == syndrome_disks) {
|
|
|
+ /* Missing P+Q, just recompute */
|
|
|
+ init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
|
|
|
+ ops_complete_compute, sh,
|
|
|
+ to_addr_conv(sh, percpu));
|
|
|
+ return async_gen_syndrome(blocks, 0, count+2,
|
|
|
+ STRIPE_SIZE, &submit);
|
|
|
+ } else {
|
|
|
+ struct page *dest;
|
|
|
+ int data_target;
|
|
|
+ int qd_idx = sh->qd_idx;
|
|
|
+
|
|
|
+ /* Missing D+Q: recompute D from P, then recompute Q */
|
|
|
+ if (target == qd_idx)
|
|
|
+ data_target = target2;
|
|
|
+ else
|
|
|
+ data_target = target;
|
|
|
+
|
|
|
+ count = 0;
|
|
|
+ for (i = disks; i-- ; ) {
|
|
|
+ if (i == data_target || i == qd_idx)
|
|
|
+ continue;
|
|
|
+ blocks[count++] = sh->dev[i].page;
|
|
|
+ }
|
|
|
+ dest = sh->dev[data_target].page;
|
|
|
+ init_async_submit(&submit,
|
|
|
+ ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
|
|
|
+ NULL, NULL, NULL,
|
|
|
+ to_addr_conv(sh, percpu));
|
|
|
+ tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
|
|
|
+ &submit);
|
|
|
+
|
|
|
+ count = set_syndrome_sources(blocks, sh);
|
|
|
+ init_async_submit(&submit, ASYNC_TX_FENCE, tx,
|
|
|
+ ops_complete_compute, sh,
|
|
|
+ to_addr_conv(sh, percpu));
|
|
|
+ return async_gen_syndrome(blocks, 0, count+2,
|
|
|
+ STRIPE_SIZE, &submit);
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
|
|
|
+ ops_complete_compute, sh,
|
|
|
+ to_addr_conv(sh, percpu));
|
|
|
+ if (failb == syndrome_disks) {
|
|
|
+ /* We're missing D+P. */
|
|
|
+ return async_raid6_datap_recov(syndrome_disks+2,
|
|
|
+ STRIPE_SIZE, faila,
|
|
|
+ blocks, &submit);
|
|
|
+ } else {
|
|
|
+ /* We're missing D+D. */
|
|
|
+ return async_raid6_2data_recov(syndrome_disks+2,
|
|
|
+ STRIPE_SIZE, faila, failb,
|
|
|
+ blocks, &submit);
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
static void ops_complete_prexor(void *stripe_head_ref)
|
|
|
{
|
|
|
struct stripe_head *sh = stripe_head_ref;
|
|
@@ -680,12 +907,13 @@ static void ops_complete_prexor(void *stripe_head_ref)
|
|
|
}
|
|
|
|
|
|
static struct dma_async_tx_descriptor *
|
|
|
-ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
|
|
|
+ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
|
|
|
+ struct dma_async_tx_descriptor *tx)
|
|
|
{
|
|
|
- /* kernel stack size limits the total number of disks */
|
|
|
int disks = sh->disks;
|
|
|
- struct page *xor_srcs[disks];
|
|
|
+ struct page **xor_srcs = percpu->scribble;
|
|
|
int count = 0, pd_idx = sh->pd_idx, i;
|
|
|
+ struct async_submit_ctl submit;
|
|
|
|
|
|
/* existing parity data subtracted */
|
|
|
struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
|
|
@@ -700,9 +928,9 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
|
|
|
xor_srcs[count++] = dev->page;
|
|
|
}
|
|
|
|
|
|
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
|
|
|
- ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx,
|
|
|
- ops_complete_prexor, sh);
|
|
|
+ init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
|
|
|
+ ops_complete_prexor, sh, to_addr_conv(sh, percpu));
|
|
|
+ tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
|
|
|
|
|
|
return tx;
|
|
|
}
|
|
@@ -742,17 +970,21 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
|
|
|
return tx;
|
|
|
}
|
|
|
|
|
|
-static void ops_complete_postxor(void *stripe_head_ref)
|
|
|
+static void ops_complete_reconstruct(void *stripe_head_ref)
|
|
|
{
|
|
|
struct stripe_head *sh = stripe_head_ref;
|
|
|
- int disks = sh->disks, i, pd_idx = sh->pd_idx;
|
|
|
+ int disks = sh->disks;
|
|
|
+ int pd_idx = sh->pd_idx;
|
|
|
+ int qd_idx = sh->qd_idx;
|
|
|
+ int i;
|
|
|
|
|
|
pr_debug("%s: stripe %llu\n", __func__,
|
|
|
(unsigned long long)sh->sector);
|
|
|
|
|
|
for (i = disks; i--; ) {
|
|
|
struct r5dev *dev = &sh->dev[i];
|
|
|
- if (dev->written || i == pd_idx)
|
|
|
+
|
|
|
+ if (dev->written || i == pd_idx || i == qd_idx)
|
|
|
set_bit(R5_UPTODATE, &dev->flags);
|
|
|
}
|
|
|
|
|
@@ -770,12 +1002,12 @@ static void ops_complete_postxor(void *stripe_head_ref)
|
|
|
}
|
|
|
|
|
|
static void
|
|
|
-ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
|
|
|
+ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
|
|
|
+ struct dma_async_tx_descriptor *tx)
|
|
|
{
|
|
|
- /* kernel stack size limits the total number of disks */
|
|
|
int disks = sh->disks;
|
|
|
- struct page *xor_srcs[disks];
|
|
|
-
|
|
|
+ struct page **xor_srcs = percpu->scribble;
|
|
|
+ struct async_submit_ctl submit;
|
|
|
int count = 0, pd_idx = sh->pd_idx, i;
|
|
|
struct page *xor_dest;
|
|
|
int prexor = 0;
|
|
@@ -809,18 +1041,36 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
|
|
|
* set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
|
|
|
* for the synchronous xor case
|
|
|
*/
|
|
|
- flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK |
|
|
|
+ flags = ASYNC_TX_ACK |
|
|
|
(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
|
|
|
|
|
|
atomic_inc(&sh->count);
|
|
|
|
|
|
- if (unlikely(count == 1)) {
|
|
|
- flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
|
|
|
- tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
|
|
|
- flags, tx, ops_complete_postxor, sh);
|
|
|
- } else
|
|
|
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
|
|
|
- flags, tx, ops_complete_postxor, sh);
|
|
|
+ init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
|
|
|
+ to_addr_conv(sh, percpu));
|
|
|
+ if (unlikely(count == 1))
|
|
|
+ tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
|
|
|
+ else
|
|
|
+ tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
|
|
|
+}
|
|
|
+
|
|
|
+static void
|
|
|
+ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
|
|
|
+ struct dma_async_tx_descriptor *tx)
|
|
|
+{
|
|
|
+ struct async_submit_ctl submit;
|
|
|
+ struct page **blocks = percpu->scribble;
|
|
|
+ int count;
|
|
|
+
|
|
|
+ pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
|
|
|
+
|
|
|
+ count = set_syndrome_sources(blocks, sh);
|
|
|
+
|
|
|
+ atomic_inc(&sh->count);
|
|
|
+
|
|
|
+ init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
|
|
|
+ sh, to_addr_conv(sh, percpu));
|
|
|
+ async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
|
|
|
}
|
|
|
|
|
|
static void ops_complete_check(void *stripe_head_ref)
|
|
@@ -835,63 +1085,115 @@ static void ops_complete_check(void *stripe_head_ref)
|
|
|
release_stripe(sh);
|
|
|
}
|
|
|
|
|
|
-static void ops_run_check(struct stripe_head *sh)
|
|
|
+static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
|
|
|
{
|
|
|
- /* kernel stack size limits the total number of disks */
|
|
|
int disks = sh->disks;
|
|
|
- struct page *xor_srcs[disks];
|
|
|
+ int pd_idx = sh->pd_idx;
|
|
|
+ int qd_idx = sh->qd_idx;
|
|
|
+ struct page *xor_dest;
|
|
|
+ struct page **xor_srcs = percpu->scribble;
|
|
|
struct dma_async_tx_descriptor *tx;
|
|
|
-
|
|
|
- int count = 0, pd_idx = sh->pd_idx, i;
|
|
|
- struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
|
|
|
+ struct async_submit_ctl submit;
|
|
|
+ int count;
|
|
|
+ int i;
|
|
|
|
|
|
pr_debug("%s: stripe %llu\n", __func__,
|
|
|
(unsigned long long)sh->sector);
|
|
|
|
|
|
+ count = 0;
|
|
|
+ xor_dest = sh->dev[pd_idx].page;
|
|
|
+ xor_srcs[count++] = xor_dest;
|
|
|
for (i = disks; i--; ) {
|
|
|
- struct r5dev *dev = &sh->dev[i];
|
|
|
- if (i != pd_idx)
|
|
|
- xor_srcs[count++] = dev->page;
|
|
|
+ if (i == pd_idx || i == qd_idx)
|
|
|
+ continue;
|
|
|
+ xor_srcs[count++] = sh->dev[i].page;
|
|
|
}
|
|
|
|
|
|
- tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
|
|
|
- &sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
|
|
|
+ init_async_submit(&submit, 0, NULL, NULL, NULL,
|
|
|
+ to_addr_conv(sh, percpu));
|
|
|
+ tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
|
|
|
+ &sh->ops.zero_sum_result, &submit);
|
|
|
+
|
|
|
+ atomic_inc(&sh->count);
|
|
|
+ init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
|
|
|
+ tx = async_trigger_callback(&submit);
|
|
|
+}
|
|
|
+
|
|
|
+static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
|
|
|
+{
|
|
|
+ struct page **srcs = percpu->scribble;
|
|
|
+ struct async_submit_ctl submit;
|
|
|
+ int count;
|
|
|
+
|
|
|
+ pr_debug("%s: stripe %llu checkp: %d\n", __func__,
|
|
|
+ (unsigned long long)sh->sector, checkp);
|
|
|
+
|
|
|
+ count = set_syndrome_sources(srcs, sh);
|
|
|
+ if (!checkp)
|
|
|
+ srcs[count] = NULL;
|
|
|
|
|
|
atomic_inc(&sh->count);
|
|
|
- tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
|
|
|
- ops_complete_check, sh);
|
|
|
+ init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
|
|
|
+ sh, to_addr_conv(sh, percpu));
|
|
|
+ async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
|
|
|
+ &sh->ops.zero_sum_result, percpu->spare_page, &submit);
|
|
|
}
|
|
|
|
|
|
-static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
|
|
|
+static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
|
|
|
{
|
|
|
int overlap_clear = 0, i, disks = sh->disks;
|
|
|
struct dma_async_tx_descriptor *tx = NULL;
|
|
|
+ raid5_conf_t *conf = sh->raid_conf;
|
|
|
+ int level = conf->level;
|
|
|
+ struct raid5_percpu *percpu;
|
|
|
+ unsigned long cpu;
|
|
|
|
|
|
+ cpu = get_cpu();
|
|
|
+ percpu = per_cpu_ptr(conf->percpu, cpu);
|
|
|
if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
|
|
|
ops_run_biofill(sh);
|
|
|
overlap_clear++;
|
|
|
}
|
|
|
|
|
|
if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
|
|
|
- tx = ops_run_compute5(sh);
|
|
|
- /* terminate the chain if postxor is not set to be run */
|
|
|
- if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
|
|
|
+ if (level < 6)
|
|
|
+ tx = ops_run_compute5(sh, percpu);
|
|
|
+ else {
|
|
|
+ if (sh->ops.target2 < 0 || sh->ops.target < 0)
|
|
|
+ tx = ops_run_compute6_1(sh, percpu);
|
|
|
+ else
|
|
|
+ tx = ops_run_compute6_2(sh, percpu);
|
|
|
+ }
|
|
|
+ /* terminate the chain if reconstruct is not set to be run */
|
|
|
+ if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
|
|
|
async_tx_ack(tx);
|
|
|
}
|
|
|
|
|
|
if (test_bit(STRIPE_OP_PREXOR, &ops_request))
|
|
|
- tx = ops_run_prexor(sh, tx);
|
|
|
+ tx = ops_run_prexor(sh, percpu, tx);
|
|
|
|
|
|
if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
|
|
|
tx = ops_run_biodrain(sh, tx);
|
|
|
overlap_clear++;
|
|
|
}
|
|
|
|
|
|
- if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
|
|
|
- ops_run_postxor(sh, tx);
|
|
|
+ if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
|
|
|
+ if (level < 6)
|
|
|
+ ops_run_reconstruct5(sh, percpu, tx);
|
|
|
+ else
|
|
|
+ ops_run_reconstruct6(sh, percpu, tx);
|
|
|
+ }
|
|
|
|
|
|
- if (test_bit(STRIPE_OP_CHECK, &ops_request))
|
|
|
- ops_run_check(sh);
|
|
|
+ if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
|
|
|
+ if (sh->check_state == check_state_run)
|
|
|
+ ops_run_check_p(sh, percpu);
|
|
|
+ else if (sh->check_state == check_state_run_q)
|
|
|
+ ops_run_check_pq(sh, percpu, 0);
|
|
|
+ else if (sh->check_state == check_state_run_pq)
|
|
|
+ ops_run_check_pq(sh, percpu, 1);
|
|
|
+ else
|
|
|
+ BUG();
|
|
|
+ }
|
|
|
|
|
|
if (overlap_clear)
|
|
|
for (i = disks; i--; ) {
|
|
@@ -899,6 +1201,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
|
|
|
if (test_and_clear_bit(R5_Overlap, &dev->flags))
|
|
|
wake_up(&sh->raid_conf->wait_for_overlap);
|
|
|
}
|
|
|
+ put_cpu();
|
|
|
}
|
|
|
|
|
|
static int grow_one_stripe(raid5_conf_t *conf)
|
|
@@ -948,6 +1251,28 @@ static int grow_stripes(raid5_conf_t *conf, int num)
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
+/**
|
|
|
+ * scribble_len - return the required size of the scribble region
|
|
|
+ * @num - total number of disks in the array
|
|
|
+ *
|
|
|
+ * The size must be enough to contain:
|
|
|
+ * 1/ a struct page pointer for each device in the array +2
|
|
|
+ * 2/ room to convert each entry in (1) to its corresponding dma
|
|
|
+ * (dma_map_page()) or page (page_address()) address.
|
|
|
+ *
|
|
|
+ * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
|
|
|
+ * calculate over all devices (not just the data blocks), using zeros in place
|
|
|
+ * of the P and Q blocks.
|
|
|
+ */
|
|
|
+static size_t scribble_len(int num)
|
|
|
+{
|
|
|
+ size_t len;
|
|
|
+
|
|
|
+ len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
|
|
|
+
|
|
|
+ return len;
|
|
|
+}
|
|
|
+
|
|
|
static int resize_stripes(raid5_conf_t *conf, int newsize)
|
|
|
{
|
|
|
/* Make all the stripes able to hold 'newsize' devices.
|
|
@@ -976,6 +1301,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
|
|
|
struct stripe_head *osh, *nsh;
|
|
|
LIST_HEAD(newstripes);
|
|
|
struct disk_info *ndisks;
|
|
|
+ unsigned long cpu;
|
|
|
int err;
|
|
|
struct kmem_cache *sc;
|
|
|
int i;
|
|
@@ -1041,7 +1367,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
|
|
|
/* Step 3.
|
|
|
* At this point, we are holding all the stripes so the array
|
|
|
* is completely stalled, so now is a good time to resize
|
|
|
- * conf->disks.
|
|
|
+ * conf->disks and the scribble region
|
|
|
*/
|
|
|
ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
|
|
|
if (ndisks) {
|
|
@@ -1052,10 +1378,30 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
|
|
|
} else
|
|
|
err = -ENOMEM;
|
|
|
|
|
|
+ get_online_cpus();
|
|
|
+ conf->scribble_len = scribble_len(newsize);
|
|
|
+ for_each_present_cpu(cpu) {
|
|
|
+ struct raid5_percpu *percpu;
|
|
|
+ void *scribble;
|
|
|
+
|
|
|
+ percpu = per_cpu_ptr(conf->percpu, cpu);
|
|
|
+ scribble = kmalloc(conf->scribble_len, GFP_NOIO);
|
|
|
+
|
|
|
+ if (scribble) {
|
|
|
+ kfree(percpu->scribble);
|
|
|
+ percpu->scribble = scribble;
|
|
|
+ } else {
|
|
|
+ err = -ENOMEM;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ put_online_cpus();
|
|
|
+
|
|
|
/* Step 4, return new stripes to service */
|
|
|
while(!list_empty(&newstripes)) {
|
|
|
nsh = list_entry(newstripes.next, struct stripe_head, lru);
|
|
|
list_del_init(&nsh->lru);
|
|
|
+
|
|
|
for (i=conf->raid_disks; i < newsize; i++)
|
|
|
if (nsh->dev[i].page == NULL) {
|
|
|
struct page *p = alloc_page(GFP_NOIO);
|
|
@@ -1594,258 +1940,13 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
|
|
|
}
|
|
|
|
|
|
|
|
|
-
|
|
|
-/*
|
|
|
- * Copy data between a page in the stripe cache, and one or more bion
|
|
|
- * The page could align with the middle of the bio, or there could be
|
|
|
- * several bion, each with several bio_vecs, which cover part of the page
|
|
|
- * Multiple bion are linked together on bi_next. There may be extras
|
|
|
- * at the end of this list. We ignore them.
|
|
|
- */
|
|
|
-static void copy_data(int frombio, struct bio *bio,
|
|
|
- struct page *page,
|
|
|
- sector_t sector)
|
|
|
-{
|
|
|
- char *pa = page_address(page);
|
|
|
- struct bio_vec *bvl;
|
|
|
- int i;
|
|
|
- int page_offset;
|
|
|
-
|
|
|
- if (bio->bi_sector >= sector)
|
|
|
- page_offset = (signed)(bio->bi_sector - sector) * 512;
|
|
|
- else
|
|
|
- page_offset = (signed)(sector - bio->bi_sector) * -512;
|
|
|
- bio_for_each_segment(bvl, bio, i) {
|
|
|
- int len = bio_iovec_idx(bio,i)->bv_len;
|
|
|
- int clen;
|
|
|
- int b_offset = 0;
|
|
|
-
|
|
|
- if (page_offset < 0) {
|
|
|
- b_offset = -page_offset;
|
|
|
- page_offset += b_offset;
|
|
|
- len -= b_offset;
|
|
|
- }
|
|
|
-
|
|
|
- if (len > 0 && page_offset + len > STRIPE_SIZE)
|
|
|
- clen = STRIPE_SIZE - page_offset;
|
|
|
- else clen = len;
|
|
|
-
|
|
|
- if (clen > 0) {
|
|
|
- char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
|
|
|
- if (frombio)
|
|
|
- memcpy(pa+page_offset, ba+b_offset, clen);
|
|
|
- else
|
|
|
- memcpy(ba+b_offset, pa+page_offset, clen);
|
|
|
- __bio_kunmap_atomic(ba, KM_USER0);
|
|
|
- }
|
|
|
- if (clen < len) /* hit end of page */
|
|
|
- break;
|
|
|
- page_offset += len;
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-#define check_xor() do { \
|
|
|
- if (count == MAX_XOR_BLOCKS) { \
|
|
|
- xor_blocks(count, STRIPE_SIZE, dest, ptr);\
|
|
|
- count = 0; \
|
|
|
- } \
|
|
|
- } while(0)
|
|
|
-
|
|
|
-static void compute_parity6(struct stripe_head *sh, int method)
|
|
|
-{
|
|
|
- raid5_conf_t *conf = sh->raid_conf;
|
|
|
- int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
|
|
|
- int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
|
|
|
- struct bio *chosen;
|
|
|
- /**** FIX THIS: This could be very bad if disks is close to 256 ****/
|
|
|
- void *ptrs[syndrome_disks+2];
|
|
|
-
|
|
|
- pd_idx = sh->pd_idx;
|
|
|
- qd_idx = sh->qd_idx;
|
|
|
- d0_idx = raid6_d0(sh);
|
|
|
-
|
|
|
- pr_debug("compute_parity, stripe %llu, method %d\n",
|
|
|
- (unsigned long long)sh->sector, method);
|
|
|
-
|
|
|
- switch(method) {
|
|
|
- case READ_MODIFY_WRITE:
|
|
|
- BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */
|
|
|
- case RECONSTRUCT_WRITE:
|
|
|
- for (i= disks; i-- ;)
|
|
|
- if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
|
|
|
- chosen = sh->dev[i].towrite;
|
|
|
- sh->dev[i].towrite = NULL;
|
|
|
-
|
|
|
- if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
|
|
|
- wake_up(&conf->wait_for_overlap);
|
|
|
-
|
|
|
- BUG_ON(sh->dev[i].written);
|
|
|
- sh->dev[i].written = chosen;
|
|
|
- }
|
|
|
- break;
|
|
|
- case CHECK_PARITY:
|
|
|
- BUG(); /* Not implemented yet */
|
|
|
- }
|
|
|
-
|
|
|
- for (i = disks; i--;)
|
|
|
- if (sh->dev[i].written) {
|
|
|
- sector_t sector = sh->dev[i].sector;
|
|
|
- struct bio *wbi = sh->dev[i].written;
|
|
|
- while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
|
|
|
- copy_data(1, wbi, sh->dev[i].page, sector);
|
|
|
- wbi = r5_next_bio(wbi, sector);
|
|
|
- }
|
|
|
-
|
|
|
- set_bit(R5_LOCKED, &sh->dev[i].flags);
|
|
|
- set_bit(R5_UPTODATE, &sh->dev[i].flags);
|
|
|
- }
|
|
|
-
|
|
|
- /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/
|
|
|
-
|
|
|
- for (i = 0; i < disks; i++)
|
|
|
- ptrs[i] = (void *)raid6_empty_zero_page;
|
|
|
-
|
|
|
- count = 0;
|
|
|
- i = d0_idx;
|
|
|
- do {
|
|
|
- int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
|
|
|
-
|
|
|
- ptrs[slot] = page_address(sh->dev[i].page);
|
|
|
- if (slot < syndrome_disks &&
|
|
|
- !test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
|
|
|
- printk(KERN_ERR "block %d/%d not uptodate "
|
|
|
- "on parity calc\n", i, count);
|
|
|
- BUG();
|
|
|
- }
|
|
|
-
|
|
|
- i = raid6_next_disk(i, disks);
|
|
|
- } while (i != d0_idx);
|
|
|
- BUG_ON(count != syndrome_disks);
|
|
|
-
|
|
|
- raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs);
|
|
|
-
|
|
|
- switch(method) {
|
|
|
- case RECONSTRUCT_WRITE:
|
|
|
- set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
|
|
|
- set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
|
|
|
- set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
|
|
|
- set_bit(R5_LOCKED, &sh->dev[qd_idx].flags);
|
|
|
- break;
|
|
|
- case UPDATE_PARITY:
|
|
|
- set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
|
|
|
- set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
|
|
|
- break;
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-
|
|
|
-/* Compute one missing block */
|
|
|
-static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
|
|
|
-{
|
|
|
- int i, count, disks = sh->disks;
|
|
|
- void *ptr[MAX_XOR_BLOCKS], *dest, *p;
|
|
|
- int qd_idx = sh->qd_idx;
|
|
|
-
|
|
|
- pr_debug("compute_block_1, stripe %llu, idx %d\n",
|
|
|
- (unsigned long long)sh->sector, dd_idx);
|
|
|
-
|
|
|
- if ( dd_idx == qd_idx ) {
|
|
|
- /* We're actually computing the Q drive */
|
|
|
- compute_parity6(sh, UPDATE_PARITY);
|
|
|
- } else {
|
|
|
- dest = page_address(sh->dev[dd_idx].page);
|
|
|
- if (!nozero) memset(dest, 0, STRIPE_SIZE);
|
|
|
- count = 0;
|
|
|
- for (i = disks ; i--; ) {
|
|
|
- if (i == dd_idx || i == qd_idx)
|
|
|
- continue;
|
|
|
- p = page_address(sh->dev[i].page);
|
|
|
- if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
|
|
|
- ptr[count++] = p;
|
|
|
- else
|
|
|
- printk("compute_block() %d, stripe %llu, %d"
|
|
|
- " not present\n", dd_idx,
|
|
|
- (unsigned long long)sh->sector, i);
|
|
|
-
|
|
|
- check_xor();
|
|
|
- }
|
|
|
- if (count)
|
|
|
- xor_blocks(count, STRIPE_SIZE, dest, ptr);
|
|
|
- if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
|
|
|
- else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-/* Compute two missing blocks */
|
|
|
-static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
|
|
|
-{
|
|
|
- int i, count, disks = sh->disks;
|
|
|
- int syndrome_disks = sh->ddf_layout ? disks : disks-2;
|
|
|
- int d0_idx = raid6_d0(sh);
|
|
|
- int faila = -1, failb = -1;
|
|
|
- /**** FIX THIS: This could be very bad if disks is close to 256 ****/
|
|
|
- void *ptrs[syndrome_disks+2];
|
|
|
-
|
|
|
- for (i = 0; i < disks ; i++)
|
|
|
- ptrs[i] = (void *)raid6_empty_zero_page;
|
|
|
- count = 0;
|
|
|
- i = d0_idx;
|
|
|
- do {
|
|
|
- int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
|
|
|
-
|
|
|
- ptrs[slot] = page_address(sh->dev[i].page);
|
|
|
-
|
|
|
- if (i == dd_idx1)
|
|
|
- faila = slot;
|
|
|
- if (i == dd_idx2)
|
|
|
- failb = slot;
|
|
|
- i = raid6_next_disk(i, disks);
|
|
|
- } while (i != d0_idx);
|
|
|
- BUG_ON(count != syndrome_disks);
|
|
|
-
|
|
|
- BUG_ON(faila == failb);
|
|
|
- if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
|
|
|
-
|
|
|
- pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
|
|
|
- (unsigned long long)sh->sector, dd_idx1, dd_idx2,
|
|
|
- faila, failb);
|
|
|
-
|
|
|
- if (failb == syndrome_disks+1) {
|
|
|
- /* Q disk is one of the missing disks */
|
|
|
- if (faila == syndrome_disks) {
|
|
|
- /* Missing P+Q, just recompute */
|
|
|
- compute_parity6(sh, UPDATE_PARITY);
|
|
|
- return;
|
|
|
- } else {
|
|
|
- /* We're missing D+Q; recompute D from P */
|
|
|
- compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ?
|
|
|
- dd_idx2 : dd_idx1),
|
|
|
- 0);
|
|
|
- compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
|
|
|
- return;
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- /* We're missing D+P or D+D; */
|
|
|
- if (failb == syndrome_disks) {
|
|
|
- /* We're missing D+P. */
|
|
|
- raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs);
|
|
|
- } else {
|
|
|
- /* We're missing D+D. */
|
|
|
- raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb,
|
|
|
- ptrs);
|
|
|
- }
|
|
|
-
|
|
|
- /* Both the above update both missing blocks */
|
|
|
- set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
|
|
|
- set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
|
|
|
-}
|
|
|
-
|
|
|
static void
|
|
|
-schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
|
|
|
+schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
|
|
|
int rcw, int expand)
|
|
|
{
|
|
|
int i, pd_idx = sh->pd_idx, disks = sh->disks;
|
|
|
+ raid5_conf_t *conf = sh->raid_conf;
|
|
|
+ int level = conf->level;
|
|
|
|
|
|
if (rcw) {
|
|
|
/* if we are not expanding this is a proper write request, and
|
|
@@ -1858,7 +1959,7 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
|
|
|
} else
|
|
|
sh->reconstruct_state = reconstruct_state_run;
|
|
|
|
|
|
- set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
|
|
|
+ set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
|
|
|
|
|
|
for (i = disks; i--; ) {
|
|
|
struct r5dev *dev = &sh->dev[i];
|
|
@@ -1871,17 +1972,18 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
|
|
|
s->locked++;
|
|
|
}
|
|
|
}
|
|
|
- if (s->locked + 1 == disks)
|
|
|
+ if (s->locked + conf->max_degraded == disks)
|
|
|
if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
|
|
|
- atomic_inc(&sh->raid_conf->pending_full_writes);
|
|
|
+ atomic_inc(&conf->pending_full_writes);
|
|
|
} else {
|
|
|
+ BUG_ON(level == 6);
|
|
|
BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
|
|
|
test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
|
|
|
|
|
|
sh->reconstruct_state = reconstruct_state_prexor_drain_run;
|
|
|
set_bit(STRIPE_OP_PREXOR, &s->ops_request);
|
|
|
set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
|
|
|
- set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
|
|
|
+ set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
|
|
|
|
|
|
for (i = disks; i--; ) {
|
|
|
struct r5dev *dev = &sh->dev[i];
|
|
@@ -1899,13 +2001,22 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- /* keep the parity disk locked while asynchronous operations
|
|
|
+ /* keep the parity disk(s) locked while asynchronous operations
|
|
|
* are in flight
|
|
|
*/
|
|
|
set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
|
|
|
clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
|
|
|
s->locked++;
|
|
|
|
|
|
+ if (level == 6) {
|
|
|
+ int qd_idx = sh->qd_idx;
|
|
|
+ struct r5dev *dev = &sh->dev[qd_idx];
|
|
|
+
|
|
|
+ set_bit(R5_LOCKED, &dev->flags);
|
|
|
+ clear_bit(R5_UPTODATE, &dev->flags);
|
|
|
+ s->locked++;
|
|
|
+ }
|
|
|
+
|
|
|
pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
|
|
|
__func__, (unsigned long long)sh->sector,
|
|
|
s->locked, s->ops_request);
|
|
@@ -1986,13 +2097,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
|
|
|
|
|
|
static void end_reshape(raid5_conf_t *conf);
|
|
|
|
|
|
-static int page_is_zero(struct page *p)
|
|
|
-{
|
|
|
- char *a = page_address(p);
|
|
|
- return ((*(u32*)a) == 0 &&
|
|
|
- memcmp(a, a+4, STRIPE_SIZE-4)==0);
|
|
|
-}
|
|
|
-
|
|
|
static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
|
|
|
struct stripe_head *sh)
|
|
|
{
|
|
@@ -2132,9 +2236,10 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
|
|
|
set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
|
|
|
set_bit(R5_Wantcompute, &dev->flags);
|
|
|
sh->ops.target = disk_idx;
|
|
|
+ sh->ops.target2 = -1;
|
|
|
s->req_compute = 1;
|
|
|
/* Careful: from this point on 'uptodate' is in the eye
|
|
|
- * of raid5_run_ops which services 'compute' operations
|
|
|
+ * of raid_run_ops which services 'compute' operations
|
|
|
* before writes. R5_Wantcompute flags a block that will
|
|
|
* be R5_UPTODATE by the time it is needed for a
|
|
|
* subsequent operation.
|
|
@@ -2173,61 +2278,104 @@ static void handle_stripe_fill5(struct stripe_head *sh,
|
|
|
set_bit(STRIPE_HANDLE, &sh->state);
|
|
|
}
|
|
|
|
|
|
-static void handle_stripe_fill6(struct stripe_head *sh,
|
|
|
- struct stripe_head_state *s, struct r6_state *r6s,
|
|
|
- int disks)
|
|
|
+/* fetch_block6 - checks the given member device to see if its data needs
|
|
|
+ * to be read or computed to satisfy a request.
|
|
|
+ *
|
|
|
+ * Returns 1 when no more member devices need to be checked, otherwise returns
|
|
|
+ * 0 to tell the loop in handle_stripe_fill6 to continue
|
|
|
+ */
|
|
|
+static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
|
|
|
+ struct r6_state *r6s, int disk_idx, int disks)
|
|
|
{
|
|
|
- int i;
|
|
|
- for (i = disks; i--; ) {
|
|
|
- struct r5dev *dev = &sh->dev[i];
|
|
|
- if (!test_bit(R5_LOCKED, &dev->flags) &&
|
|
|
- !test_bit(R5_UPTODATE, &dev->flags) &&
|
|
|
- (dev->toread || (dev->towrite &&
|
|
|
- !test_bit(R5_OVERWRITE, &dev->flags)) ||
|
|
|
- s->syncing || s->expanding ||
|
|
|
- (s->failed >= 1 &&
|
|
|
- (sh->dev[r6s->failed_num[0]].toread ||
|
|
|
- s->to_write)) ||
|
|
|
- (s->failed >= 2 &&
|
|
|
- (sh->dev[r6s->failed_num[1]].toread ||
|
|
|
- s->to_write)))) {
|
|
|
- /* we would like to get this block, possibly
|
|
|
- * by computing it, but we might not be able to
|
|
|
+ struct r5dev *dev = &sh->dev[disk_idx];
|
|
|
+ struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]],
|
|
|
+ &sh->dev[r6s->failed_num[1]] };
|
|
|
+
|
|
|
+ if (!test_bit(R5_LOCKED, &dev->flags) &&
|
|
|
+ !test_bit(R5_UPTODATE, &dev->flags) &&
|
|
|
+ (dev->toread ||
|
|
|
+ (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
|
|
|
+ s->syncing || s->expanding ||
|
|
|
+ (s->failed >= 1 &&
|
|
|
+ (fdev[0]->toread || s->to_write)) ||
|
|
|
+ (s->failed >= 2 &&
|
|
|
+ (fdev[1]->toread || s->to_write)))) {
|
|
|
+ /* we would like to get this block, possibly by computing it,
|
|
|
+ * otherwise read it if the backing disk is insync
|
|
|
+ */
|
|
|
+ BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
|
|
|
+ BUG_ON(test_bit(R5_Wantread, &dev->flags));
|
|
|
+ if ((s->uptodate == disks - 1) &&
|
|
|
+ (s->failed && (disk_idx == r6s->failed_num[0] ||
|
|
|
+ disk_idx == r6s->failed_num[1]))) {
|
|
|
+ /* this disk has failed and we were asked to fetch it,
|
|
|
+ * so compute it
|
|
|
*/
|
|
|
- if ((s->uptodate == disks - 1) &&
|
|
|
- (s->failed && (i == r6s->failed_num[0] ||
|
|
|
- i == r6s->failed_num[1]))) {
|
|
|
- pr_debug("Computing stripe %llu block %d\n",
|
|
|
- (unsigned long long)sh->sector, i);
|
|
|
- compute_block_1(sh, i, 0);
|
|
|
- s->uptodate++;
|
|
|
- } else if ( s->uptodate == disks-2 && s->failed >= 2 ) {
|
|
|
- /* Computing 2-failure is *very* expensive; only
|
|
|
- * do it if failed >= 2
|
|
|
- */
|
|
|
- int other;
|
|
|
- for (other = disks; other--; ) {
|
|
|
- if (other == i)
|
|
|
- continue;
|
|
|
- if (!test_bit(R5_UPTODATE,
|
|
|
- &sh->dev[other].flags))
|
|
|
- break;
|
|
|
- }
|
|
|
- BUG_ON(other < 0);
|
|
|
- pr_debug("Computing stripe %llu blocks %d,%d\n",
|
|
|
- (unsigned long long)sh->sector,
|
|
|
- i, other);
|
|
|
- compute_block_2(sh, i, other);
|
|
|
- s->uptodate += 2;
|
|
|
- } else if (test_bit(R5_Insync, &dev->flags)) {
|
|
|
- set_bit(R5_LOCKED, &dev->flags);
|
|
|
- set_bit(R5_Wantread, &dev->flags);
|
|
|
- s->locked++;
|
|
|
- pr_debug("Reading block %d (sync=%d)\n",
|
|
|
- i, s->syncing);
|
|
|
+ pr_debug("Computing stripe %llu block %d\n",
|
|
|
+ (unsigned long long)sh->sector, disk_idx);
|
|
|
+ set_bit(STRIPE_COMPUTE_RUN, &sh->state);
|
|
|
+ set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
|
|
|
+ set_bit(R5_Wantcompute, &dev->flags);
|
|
|
+ sh->ops.target = disk_idx;
|
|
|
+ sh->ops.target2 = -1; /* no 2nd target */
|
|
|
+ s->req_compute = 1;
|
|
|
+ s->uptodate++;
|
|
|
+ return 1;
|
|
|
+ } else if (s->uptodate == disks-2 && s->failed >= 2) {
|
|
|
+ /* Computing 2-failure is *very* expensive; only
|
|
|
+ * do it if failed >= 2
|
|
|
+ */
|
|
|
+ int other;
|
|
|
+ for (other = disks; other--; ) {
|
|
|
+ if (other == disk_idx)
|
|
|
+ continue;
|
|
|
+ if (!test_bit(R5_UPTODATE,
|
|
|
+ &sh->dev[other].flags))
|
|
|
+ break;
|
|
|
}
|
|
|
+ BUG_ON(other < 0);
|
|
|
+ pr_debug("Computing stripe %llu blocks %d,%d\n",
|
|
|
+ (unsigned long long)sh->sector,
|
|
|
+ disk_idx, other);
|
|
|
+ set_bit(STRIPE_COMPUTE_RUN, &sh->state);
|
|
|
+ set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
|
|
|
+ set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
|
|
|
+ set_bit(R5_Wantcompute, &sh->dev[other].flags);
|
|
|
+ sh->ops.target = disk_idx;
|
|
|
+ sh->ops.target2 = other;
|
|
|
+ s->uptodate += 2;
|
|
|
+ s->req_compute = 1;
|
|
|
+ return 1;
|
|
|
+ } else if (test_bit(R5_Insync, &dev->flags)) {
|
|
|
+ set_bit(R5_LOCKED, &dev->flags);
|
|
|
+ set_bit(R5_Wantread, &dev->flags);
|
|
|
+ s->locked++;
|
|
|
+ pr_debug("Reading block %d (sync=%d)\n",
|
|
|
+ disk_idx, s->syncing);
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
+/**
|
|
|
+ * handle_stripe_fill6 - read or compute data to satisfy pending requests.
|
|
|
+ */
|
|
|
+static void handle_stripe_fill6(struct stripe_head *sh,
|
|
|
+ struct stripe_head_state *s, struct r6_state *r6s,
|
|
|
+ int disks)
|
|
|
+{
|
|
|
+ int i;
|
|
|
+
|
|
|
+ /* look for blocks to read/compute, skip this if a compute
|
|
|
+ * is already in flight, or if the stripe contents are in the
|
|
|
+ * midst of changing due to a write
|
|
|
+ */
|
|
|
+ if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
|
|
|
+ !sh->reconstruct_state)
|
|
|
+ for (i = disks; i--; )
|
|
|
+ if (fetch_block6(sh, s, r6s, i, disks))
|
|
|
+ break;
|
|
|
set_bit(STRIPE_HANDLE, &sh->state);
|
|
|
}
|
|
|
|
|
@@ -2361,114 +2509,61 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
|
|
|
*/
|
|
|
/* since handle_stripe can be called at any time we need to handle the
|
|
|
* case where a compute block operation has been submitted and then a
|
|
|
- * subsequent call wants to start a write request. raid5_run_ops only
|
|
|
- * handles the case where compute block and postxor are requested
|
|
|
+ * subsequent call wants to start a write request. raid_run_ops only
|
|
|
+ * handles the case where compute block and reconstruct are requested
|
|
|
* simultaneously. If this is not the case then new writes need to be
|
|
|
* held off until the compute completes.
|
|
|
*/
|
|
|
if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
|
|
|
(s->locked == 0 && (rcw == 0 || rmw == 0) &&
|
|
|
!test_bit(STRIPE_BIT_DELAY, &sh->state)))
|
|
|
- schedule_reconstruction5(sh, s, rcw == 0, 0);
|
|
|
+ schedule_reconstruction(sh, s, rcw == 0, 0);
|
|
|
}
|
|
|
|
|
|
static void handle_stripe_dirtying6(raid5_conf_t *conf,
|
|
|
struct stripe_head *sh, struct stripe_head_state *s,
|
|
|
- struct r6_state *r6s, int disks)
|
|
|
-{
|
|
|
- int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;
|
|
|
- int qd_idx = sh->qd_idx;
|
|
|
- for (i = disks; i--; ) {
|
|
|
- struct r5dev *dev = &sh->dev[i];
|
|
|
- /* Would I have to read this buffer for reconstruct_write */
|
|
|
- if (!test_bit(R5_OVERWRITE, &dev->flags)
|
|
|
- && i != pd_idx && i != qd_idx
|
|
|
- && (!test_bit(R5_LOCKED, &dev->flags)
|
|
|
- ) &&
|
|
|
- !test_bit(R5_UPTODATE, &dev->flags)) {
|
|
|
- if (test_bit(R5_Insync, &dev->flags)) rcw++;
|
|
|
- else {
|
|
|
- pr_debug("raid6: must_compute: "
|
|
|
- "disk %d flags=%#lx\n", i, dev->flags);
|
|
|
- must_compute++;
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- pr_debug("for sector %llu, rcw=%d, must_compute=%d\n",
|
|
|
- (unsigned long long)sh->sector, rcw, must_compute);
|
|
|
- set_bit(STRIPE_HANDLE, &sh->state);
|
|
|
-
|
|
|
- if (rcw > 0)
|
|
|
- /* want reconstruct write, but need to get some data */
|
|
|
- for (i = disks; i--; ) {
|
|
|
- struct r5dev *dev = &sh->dev[i];
|
|
|
- if (!test_bit(R5_OVERWRITE, &dev->flags)
|
|
|
- && !(s->failed == 0 && (i == pd_idx || i == qd_idx))
|
|
|
- && !test_bit(R5_LOCKED, &dev->flags) &&
|
|
|
- !test_bit(R5_UPTODATE, &dev->flags) &&
|
|
|
- test_bit(R5_Insync, &dev->flags)) {
|
|
|
- if (
|
|
|
- test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
|
|
|
- pr_debug("Read_old stripe %llu "
|
|
|
- "block %d for Reconstruct\n",
|
|
|
- (unsigned long long)sh->sector, i);
|
|
|
- set_bit(R5_LOCKED, &dev->flags);
|
|
|
- set_bit(R5_Wantread, &dev->flags);
|
|
|
- s->locked++;
|
|
|
- } else {
|
|
|
- pr_debug("Request delayed stripe %llu "
|
|
|
- "block %d for Reconstruct\n",
|
|
|
- (unsigned long long)sh->sector, i);
|
|
|
- set_bit(STRIPE_DELAYED, &sh->state);
|
|
|
- set_bit(STRIPE_HANDLE, &sh->state);
|
|
|
- }
|
|
|
+ struct r6_state *r6s, int disks)
|
|
|
+{
|
|
|
+ int rcw = 0, pd_idx = sh->pd_idx, i;
|
|
|
+ int qd_idx = sh->qd_idx;
|
|
|
+
|
|
|
+ set_bit(STRIPE_HANDLE, &sh->state);
|
|
|
+ for (i = disks; i--; ) {
|
|
|
+ struct r5dev *dev = &sh->dev[i];
|
|
|
+ /* check whether we are still missing this block's data */
|
|
|
+ if (!test_bit(R5_OVERWRITE, &dev->flags) &&
|
|
|
+ i != pd_idx && i != qd_idx &&
|
|
|
+ !test_bit(R5_LOCKED, &dev->flags) &&
|
|
|
+ !(test_bit(R5_UPTODATE, &dev->flags) ||
|
|
|
+ test_bit(R5_Wantcompute, &dev->flags))) {
|
|
|
+ rcw++;
|
|
|
+ if (!test_bit(R5_Insync, &dev->flags))
|
|
|
+ continue; /* it's a failed drive */
|
|
|
+
|
|
|
+ if (
|
|
|
+ test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
|
|
|
+ pr_debug("Read_old stripe %llu "
|
|
|
+ "block %d for Reconstruct\n",
|
|
|
+ (unsigned long long)sh->sector, i);
|
|
|
+ set_bit(R5_LOCKED, &dev->flags);
|
|
|
+ set_bit(R5_Wantread, &dev->flags);
|
|
|
+ s->locked++;
|
|
|
+ } else {
|
|
|
+ pr_debug("Request delayed stripe %llu "
|
|
|
+ "block %d for Reconstruct\n",
|
|
|
+ (unsigned long long)sh->sector, i);
|
|
|
+ set_bit(STRIPE_DELAYED, &sh->state);
|
|
|
+ set_bit(STRIPE_HANDLE, &sh->state);
|
|
|
}
|
|
|
}
|
|
|
+ }
|
|
|
/* now if nothing is locked, and if we have enough data, we can start a
|
|
|
* write request
|
|
|
*/
|
|
|
- if (s->locked == 0 && rcw == 0 &&
|
|
|
+ if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
|
|
|
+ s->locked == 0 && rcw == 0 &&
|
|
|
!test_bit(STRIPE_BIT_DELAY, &sh->state)) {
|
|
|
- if (must_compute > 0) {
|
|
|
- /* We have failed blocks and need to compute them */
|
|
|
- switch (s->failed) {
|
|
|
- case 0:
|
|
|
- BUG();
|
|
|
- case 1:
|
|
|
- compute_block_1(sh, r6s->failed_num[0], 0);
|
|
|
- break;
|
|
|
- case 2:
|
|
|
- compute_block_2(sh, r6s->failed_num[0],
|
|
|
- r6s->failed_num[1]);
|
|
|
- break;
|
|
|
- default: /* This request should have been failed? */
|
|
|
- BUG();
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- pr_debug("Computing parity for stripe %llu\n",
|
|
|
- (unsigned long long)sh->sector);
|
|
|
- compute_parity6(sh, RECONSTRUCT_WRITE);
|
|
|
- /* now every locked buffer is ready to be written */
|
|
|
- for (i = disks; i--; )
|
|
|
- if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
|
|
|
- pr_debug("Writing stripe %llu block %d\n",
|
|
|
- (unsigned long long)sh->sector, i);
|
|
|
- s->locked++;
|
|
|
- set_bit(R5_Wantwrite, &sh->dev[i].flags);
|
|
|
- }
|
|
|
- if (s->locked == disks)
|
|
|
- if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
|
|
|
- atomic_inc(&conf->pending_full_writes);
|
|
|
- /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
|
|
|
- set_bit(STRIPE_INSYNC, &sh->state);
|
|
|
-
|
|
|
- if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
|
|
|
- atomic_dec(&conf->preread_active_stripes);
|
|
|
- if (atomic_read(&conf->preread_active_stripes) <
|
|
|
- IO_THRESHOLD)
|
|
|
- md_wakeup_thread(conf->mddev->thread);
|
|
|
- }
|
|
|
+ schedule_reconstruction(sh, s, 1, 0);
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -2527,7 +2622,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
|
|
|
* we are done. Otherwise update the mismatch count and repair
|
|
|
* parity if !MD_RECOVERY_CHECK
|
|
|
*/
|
|
|
- if (sh->ops.zero_sum_result == 0)
|
|
|
+ if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
|
|
|
/* parity is correct (on disc,
|
|
|
* not in buffer any more)
|
|
|
*/
|
|
@@ -2544,6 +2639,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
|
|
|
set_bit(R5_Wantcompute,
|
|
|
&sh->dev[sh->pd_idx].flags);
|
|
|
sh->ops.target = sh->pd_idx;
|
|
|
+ sh->ops.target2 = -1;
|
|
|
s->uptodate++;
|
|
|
}
|
|
|
}
|
|
@@ -2560,67 +2656,74 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
|
|
|
|
|
|
|
|
|
static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
|
|
|
- struct stripe_head_state *s,
|
|
|
- struct r6_state *r6s, struct page *tmp_page,
|
|
|
- int disks)
|
|
|
+ struct stripe_head_state *s,
|
|
|
+ struct r6_state *r6s, int disks)
|
|
|
{
|
|
|
- int update_p = 0, update_q = 0;
|
|
|
- struct r5dev *dev;
|
|
|
int pd_idx = sh->pd_idx;
|
|
|
int qd_idx = sh->qd_idx;
|
|
|
+ struct r5dev *dev;
|
|
|
|
|
|
set_bit(STRIPE_HANDLE, &sh->state);
|
|
|
|
|
|
BUG_ON(s->failed > 2);
|
|
|
- BUG_ON(s->uptodate < disks);
|
|
|
+
|
|
|
/* Want to check and possibly repair P and Q.
|
|
|
* However there could be one 'failed' device, in which
|
|
|
* case we can only check one of them, possibly using the
|
|
|
* other to generate missing data
|
|
|
*/
|
|
|
|
|
|
- /* If !tmp_page, we cannot do the calculations,
|
|
|
- * but as we have set STRIPE_HANDLE, we will soon be called
|
|
|
- * by stripe_handle with a tmp_page - just wait until then.
|
|
|
- */
|
|
|
- if (tmp_page) {
|
|
|
+ switch (sh->check_state) {
|
|
|
+ case check_state_idle:
|
|
|
+ /* start a new check operation if there are < 2 failures */
|
|
|
if (s->failed == r6s->q_failed) {
|
|
|
- /* The only possible failed device holds 'Q', so it
|
|
|
+ /* The only possible failed device holds Q, so it
|
|
|
* makes sense to check P (If anything else were failed,
|
|
|
* we would have used P to recreate it).
|
|
|
*/
|
|
|
- compute_block_1(sh, pd_idx, 1);
|
|
|
- if (!page_is_zero(sh->dev[pd_idx].page)) {
|
|
|
- compute_block_1(sh, pd_idx, 0);
|
|
|
- update_p = 1;
|
|
|
- }
|
|
|
+ sh->check_state = check_state_run;
|
|
|
}
|
|
|
if (!r6s->q_failed && s->failed < 2) {
|
|
|
- /* q is not failed, and we didn't use it to generate
|
|
|
+ /* Q is not failed, and we didn't use it to generate
|
|
|
* anything, so it makes sense to check it
|
|
|
*/
|
|
|
- memcpy(page_address(tmp_page),
|
|
|
- page_address(sh->dev[qd_idx].page),
|
|
|
- STRIPE_SIZE);
|
|
|
- compute_parity6(sh, UPDATE_PARITY);
|
|
|
- if (memcmp(page_address(tmp_page),
|
|
|
- page_address(sh->dev[qd_idx].page),
|
|
|
- STRIPE_SIZE) != 0) {
|
|
|
- clear_bit(STRIPE_INSYNC, &sh->state);
|
|
|
- update_q = 1;
|
|
|
- }
|
|
|
+ if (sh->check_state == check_state_run)
|
|
|
+ sh->check_state = check_state_run_pq;
|
|
|
+ else
|
|
|
+ sh->check_state = check_state_run_q;
|
|
|
}
|
|
|
- if (update_p || update_q) {
|
|
|
- conf->mddev->resync_mismatches += STRIPE_SECTORS;
|
|
|
- if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
|
|
|
- /* don't try to repair!! */
|
|
|
- update_p = update_q = 0;
|
|
|
+
|
|
|
+ /* discard potentially stale zero_sum_result */
|
|
|
+ sh->ops.zero_sum_result = 0;
|
|
|
+
|
|
|
+ if (sh->check_state == check_state_run) {
|
|
|
+ /* async_xor_zero_sum destroys the contents of P */
|
|
|
+ clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
|
|
|
+ s->uptodate--;
|
|
|
+ }
|
|
|
+ if (sh->check_state >= check_state_run &&
|
|
|
+ sh->check_state <= check_state_run_pq) {
|
|
|
+ /* async_syndrome_zero_sum preserves P and Q, so
|
|
|
+ * no need to mark them !uptodate here
|
|
|
+ */
|
|
|
+ set_bit(STRIPE_OP_CHECK, &s->ops_request);
|
|
|
+ break;
|
|
|
}
|
|
|
|
|
|
+ /* we have a 2-disk failure */
|
|
|
+ BUG_ON(s->failed != 2);
|
|
|
+ /* fall through */
|
|
|
+ case check_state_compute_result:
|
|
|
+ sh->check_state = check_state_idle;
|
|
|
+
|
|
|
+ /* check that a write has not made the stripe insync */
|
|
|
+ if (test_bit(STRIPE_INSYNC, &sh->state))
|
|
|
+ break;
|
|
|
+
|
|
|
/* now write out any block on a failed drive,
|
|
|
- * or P or Q if they need it
|
|
|
+ * or P or Q if they were recomputed
|
|
|
*/
|
|
|
-
|
|
|
+ BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
|
|
|
if (s->failed == 2) {
|
|
|
dev = &sh->dev[r6s->failed_num[1]];
|
|
|
s->locked++;
|
|
@@ -2633,14 +2736,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
|
|
|
set_bit(R5_LOCKED, &dev->flags);
|
|
|
set_bit(R5_Wantwrite, &dev->flags);
|
|
|
}
|
|
|
-
|
|
|
- if (update_p) {
|
|
|
+ if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
|
|
|
dev = &sh->dev[pd_idx];
|
|
|
s->locked++;
|
|
|
set_bit(R5_LOCKED, &dev->flags);
|
|
|
set_bit(R5_Wantwrite, &dev->flags);
|
|
|
}
|
|
|
- if (update_q) {
|
|
|
+ if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
|
|
|
dev = &sh->dev[qd_idx];
|
|
|
s->locked++;
|
|
|
set_bit(R5_LOCKED, &dev->flags);
|
|
@@ -2649,6 +2751,70 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
|
|
|
clear_bit(STRIPE_DEGRADED, &sh->state);
|
|
|
|
|
|
set_bit(STRIPE_INSYNC, &sh->state);
|
|
|
+ break;
|
|
|
+ case check_state_run:
|
|
|
+ case check_state_run_q:
|
|
|
+ case check_state_run_pq:
|
|
|
+ break; /* we will be called again upon completion */
|
|
|
+ case check_state_check_result:
|
|
|
+ sh->check_state = check_state_idle;
|
|
|
+
|
|
|
+ /* handle a successful check operation, if parity is correct
|
|
|
+ * we are done. Otherwise update the mismatch count and repair
|
|
|
+ * parity if !MD_RECOVERY_CHECK
|
|
|
+ */
|
|
|
+ if (sh->ops.zero_sum_result == 0) {
|
|
|
+ /* both parities are correct */
|
|
|
+ if (!s->failed)
|
|
|
+ set_bit(STRIPE_INSYNC, &sh->state);
|
|
|
+ else {
|
|
|
+ /* in contrast to the raid5 case we can validate
|
|
|
+ * parity, but still have a failure to write
|
|
|
+ * back
|
|
|
+ */
|
|
|
+ sh->check_state = check_state_compute_result;
|
|
|
+ /* Returning at this point means that we may go
|
|
|
+ * off and bring p and/or q uptodate again so
|
|
|
+ * we make sure to check zero_sum_result again
|
|
|
+ * to verify if p or q need writeback
|
|
|
+ */
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ conf->mddev->resync_mismatches += STRIPE_SECTORS;
|
|
|
+ if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
|
|
|
+ /* don't try to repair!! */
|
|
|
+ set_bit(STRIPE_INSYNC, &sh->state);
|
|
|
+ else {
|
|
|
+ int *target = &sh->ops.target;
|
|
|
+
|
|
|
+ sh->ops.target = -1;
|
|
|
+ sh->ops.target2 = -1;
|
|
|
+ sh->check_state = check_state_compute_run;
|
|
|
+ set_bit(STRIPE_COMPUTE_RUN, &sh->state);
|
|
|
+ set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
|
|
|
+ if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
|
|
|
+ set_bit(R5_Wantcompute,
|
|
|
+ &sh->dev[pd_idx].flags);
|
|
|
+ *target = pd_idx;
|
|
|
+ target = &sh->ops.target2;
|
|
|
+ s->uptodate++;
|
|
|
+ }
|
|
|
+ if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
|
|
|
+ set_bit(R5_Wantcompute,
|
|
|
+ &sh->dev[qd_idx].flags);
|
|
|
+ *target = qd_idx;
|
|
|
+ s->uptodate++;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ break;
|
|
|
+ case check_state_compute_run:
|
|
|
+ break;
|
|
|
+ default:
|
|
|
+ printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
|
|
|
+ __func__, sh->check_state,
|
|
|
+ (unsigned long long) sh->sector);
|
|
|
+ BUG();
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -2666,6 +2832,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
|
|
|
if (i != sh->pd_idx && i != sh->qd_idx) {
|
|
|
int dd_idx, j;
|
|
|
struct stripe_head *sh2;
|
|
|
+ struct async_submit_ctl submit;
|
|
|
|
|
|
sector_t bn = compute_blocknr(sh, i, 1);
|
|
|
sector_t s = raid5_compute_sector(conf, bn, 0,
|
|
@@ -2685,9 +2852,10 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
|
|
|
}
|
|
|
|
|
|
/* place all the copies on one channel */
|
|
|
+ init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
|
|
|
tx = async_memcpy(sh2->dev[dd_idx].page,
|
|
|
- sh->dev[i].page, 0, 0, STRIPE_SIZE,
|
|
|
- ASYNC_TX_DEP_ACK, tx, NULL, NULL);
|
|
|
+ sh->dev[i].page, 0, 0, STRIPE_SIZE,
|
|
|
+ &submit);
|
|
|
|
|
|
set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
|
|
|
set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
|
|
@@ -2756,7 +2924,8 @@ static bool handle_stripe5(struct stripe_head *sh)
|
|
|
rcu_read_lock();
|
|
|
for (i=disks; i--; ) {
|
|
|
mdk_rdev_t *rdev;
|
|
|
- struct r5dev *dev = &sh->dev[i];
|
|
|
+
|
|
|
+ dev = &sh->dev[i];
|
|
|
clear_bit(R5_Insync, &dev->flags);
|
|
|
|
|
|
pr_debug("check %d: state 0x%lx toread %p read %p write %p "
|
|
@@ -2973,7 +3142,7 @@ static bool handle_stripe5(struct stripe_head *sh)
|
|
|
/* Need to write out all blocks after computing parity */
|
|
|
sh->disks = conf->raid_disks;
|
|
|
stripe_set_idx(sh->sector, conf, 0, sh);
|
|
|
- schedule_reconstruction5(sh, &s, 1, 1);
|
|
|
+ schedule_reconstruction(sh, &s, 1, 1);
|
|
|
} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
|
|
|
clear_bit(STRIPE_EXPAND_READY, &sh->state);
|
|
|
atomic_dec(&conf->reshape_stripes);
|
|
@@ -2993,7 +3162,7 @@ static bool handle_stripe5(struct stripe_head *sh)
|
|
|
md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
|
|
|
|
|
|
if (s.ops_request)
|
|
|
- raid5_run_ops(sh, s.ops_request);
|
|
|
+ raid_run_ops(sh, s.ops_request);
|
|
|
|
|
|
ops_run_io(sh, &s);
|
|
|
|
|
@@ -3002,7 +3171,7 @@ static bool handle_stripe5(struct stripe_head *sh)
|
|
|
return blocked_rdev == NULL;
|
|
|
}
|
|
|
|
|
|
-static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
|
|
|
+static bool handle_stripe6(struct stripe_head *sh)
|
|
|
{
|
|
|
raid5_conf_t *conf = sh->raid_conf;
|
|
|
int disks = sh->disks;
|
|
@@ -3014,9 +3183,10 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
|
|
|
mdk_rdev_t *blocked_rdev = NULL;
|
|
|
|
|
|
pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
|
|
|
- "pd_idx=%d, qd_idx=%d\n",
|
|
|
+ "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
|
|
|
(unsigned long long)sh->sector, sh->state,
|
|
|
- atomic_read(&sh->count), pd_idx, qd_idx);
|
|
|
+ atomic_read(&sh->count), pd_idx, qd_idx,
|
|
|
+ sh->check_state, sh->reconstruct_state);
|
|
|
memset(&s, 0, sizeof(s));
|
|
|
|
|
|
spin_lock(&sh->lock);
|
|
@@ -3036,35 +3206,26 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
|
|
|
|
|
|
pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
|
|
|
i, dev->flags, dev->toread, dev->towrite, dev->written);
|
|
|
- /* maybe we can reply to a read */
|
|
|
- if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
|
|
|
- struct bio *rbi, *rbi2;
|
|
|
- pr_debug("Return read for disc %d\n", i);
|
|
|
- spin_lock_irq(&conf->device_lock);
|
|
|
- rbi = dev->toread;
|
|
|
- dev->toread = NULL;
|
|
|
- if (test_and_clear_bit(R5_Overlap, &dev->flags))
|
|
|
- wake_up(&conf->wait_for_overlap);
|
|
|
- spin_unlock_irq(&conf->device_lock);
|
|
|
- while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
|
|
|
- copy_data(0, rbi, dev->page, dev->sector);
|
|
|
- rbi2 = r5_next_bio(rbi, dev->sector);
|
|
|
- spin_lock_irq(&conf->device_lock);
|
|
|
- if (!raid5_dec_bi_phys_segments(rbi)) {
|
|
|
- rbi->bi_next = return_bi;
|
|
|
- return_bi = rbi;
|
|
|
- }
|
|
|
- spin_unlock_irq(&conf->device_lock);
|
|
|
- rbi = rbi2;
|
|
|
- }
|
|
|
- }
|
|
|
+ /* maybe we can reply to a read
|
|
|
+ *
|
|
|
+ * new wantfill requests are only permitted while
|
|
|
+ * ops_complete_biofill is guaranteed to be inactive
|
|
|
+ */
|
|
|
+ if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
|
|
|
+ !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
|
|
|
+ set_bit(R5_Wantfill, &dev->flags);
|
|
|
|
|
|
/* now count some things */
|
|
|
if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
|
|
|
if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
|
|
|
+ if (test_bit(R5_Wantcompute, &dev->flags)) {
|
|
|
+ s.compute++;
|
|
|
+ BUG_ON(s.compute > 2);
|
|
|
+ }
|
|
|
|
|
|
-
|
|
|
- if (dev->toread)
|
|
|
+ if (test_bit(R5_Wantfill, &dev->flags)) {
|
|
|
+ s.to_fill++;
|
|
|
+ } else if (dev->toread)
|
|
|
s.to_read++;
|
|
|
if (dev->towrite) {
|
|
|
s.to_write++;
|
|
@@ -3105,6 +3266,11 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
|
|
|
blocked_rdev = NULL;
|
|
|
}
|
|
|
|
|
|
+ if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
|
|
|
+ set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
|
|
|
+ set_bit(STRIPE_BIOFILL_RUN, &sh->state);
|
|
|
+ }
|
|
|
+
|
|
|
pr_debug("locked=%d uptodate=%d to_read=%d"
|
|
|
" to_write=%d failed=%d failed_num=%d,%d\n",
|
|
|
s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
|
|
@@ -3145,19 +3311,62 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
|
|
|
* or to load a block that is being partially written.
|
|
|
*/
|
|
|
if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
|
|
|
- (s.syncing && (s.uptodate < disks)) || s.expanding)
|
|
|
+ (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
|
|
|
handle_stripe_fill6(sh, &s, &r6s, disks);
|
|
|
|
|
|
- /* now to consider writing and what else, if anything should be read */
|
|
|
- if (s.to_write)
|
|
|
+ /* Now we check to see if any write operations have recently
|
|
|
+ * completed
|
|
|
+ */
|
|
|
+ if (sh->reconstruct_state == reconstruct_state_drain_result) {
|
|
|
+ int qd_idx = sh->qd_idx;
|
|
|
+
|
|
|
+ sh->reconstruct_state = reconstruct_state_idle;
|
|
|
+ /* All the 'written' buffers and the parity blocks are ready to
|
|
|
+ * be written back to disk
|
|
|
+ */
|
|
|
+ BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
|
|
|
+ BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags));
|
|
|
+ for (i = disks; i--; ) {
|
|
|
+ dev = &sh->dev[i];
|
|
|
+ if (test_bit(R5_LOCKED, &dev->flags) &&
|
|
|
+ (i == sh->pd_idx || i == qd_idx ||
|
|
|
+ dev->written)) {
|
|
|
+ pr_debug("Writing block %d\n", i);
|
|
|
+ BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
|
|
|
+ set_bit(R5_Wantwrite, &dev->flags);
|
|
|
+ if (!test_bit(R5_Insync, &dev->flags) ||
|
|
|
+ ((i == sh->pd_idx || i == qd_idx) &&
|
|
|
+ s.failed == 0))
|
|
|
+ set_bit(STRIPE_INSYNC, &sh->state);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
|
|
|
+ atomic_dec(&conf->preread_active_stripes);
|
|
|
+ if (atomic_read(&conf->preread_active_stripes) <
|
|
|
+ IO_THRESHOLD)
|
|
|
+ md_wakeup_thread(conf->mddev->thread);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /* Now to consider new write requests and what else, if anything
|
|
|
+ * should be read. We do not handle new writes when:
|
|
|
+ * 1/ A 'write' operation (copy+gen_syndrome) is already in flight.
|
|
|
+ * 2/ A 'check' operation is in flight, as it may clobber the parity
|
|
|
+ * block.
|
|
|
+ */
|
|
|
+ if (s.to_write && !sh->reconstruct_state && !sh->check_state)
|
|
|
handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
|
|
|
|
|
|
/* maybe we need to check and possibly fix the parity for this stripe
|
|
|
* Any reads will already have been scheduled, so we just see if enough
|
|
|
- * data is available
|
|
|
+ * data is available. The parity check is held off while parity
|
|
|
+ * dependent operations are in flight.
|
|
|
*/
|
|
|
- if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state))
|
|
|
- handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks);
|
|
|
+ if (sh->check_state ||
|
|
|
+ (s.syncing && s.locked == 0 &&
|
|
|
+ !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
|
|
|
+ !test_bit(STRIPE_INSYNC, &sh->state)))
|
|
|
+ handle_parity_checks6(conf, sh, &s, &r6s, disks);
|
|
|
|
|
|
if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
|
|
|
md_done_sync(conf->mddev, STRIPE_SECTORS,1);
|
|
@@ -3178,15 +3387,29 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
|
|
|
set_bit(R5_Wantwrite, &dev->flags);
|
|
|
set_bit(R5_ReWrite, &dev->flags);
|
|
|
set_bit(R5_LOCKED, &dev->flags);
|
|
|
+ s.locked++;
|
|
|
} else {
|
|
|
/* let's read it back */
|
|
|
set_bit(R5_Wantread, &dev->flags);
|
|
|
set_bit(R5_LOCKED, &dev->flags);
|
|
|
+ s.locked++;
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
|
|
|
+ /* Finish reconstruct operations initiated by the expansion process */
|
|
|
+ if (sh->reconstruct_state == reconstruct_state_result) {
|
|
|
+ sh->reconstruct_state = reconstruct_state_idle;
|
|
|
+ clear_bit(STRIPE_EXPANDING, &sh->state);
|
|
|
+ for (i = conf->raid_disks; i--; ) {
|
|
|
+ set_bit(R5_Wantwrite, &sh->dev[i].flags);
|
|
|
+ set_bit(R5_LOCKED, &sh->dev[i].flags);
|
|
|
+ s.locked++;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
|
|
|
+ !sh->reconstruct_state) {
|
|
|
struct stripe_head *sh2
|
|
|
= get_active_stripe(conf, sh->sector, 1, 1, 1);
|
|
|
if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
|
|
@@ -3207,14 +3430,8 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
|
|
|
/* Need to write out all blocks after computing P&Q */
|
|
|
sh->disks = conf->raid_disks;
|
|
|
stripe_set_idx(sh->sector, conf, 0, sh);
|
|
|
- compute_parity6(sh, RECONSTRUCT_WRITE);
|
|
|
- for (i = conf->raid_disks ; i-- ; ) {
|
|
|
- set_bit(R5_LOCKED, &sh->dev[i].flags);
|
|
|
- s.locked++;
|
|
|
- set_bit(R5_Wantwrite, &sh->dev[i].flags);
|
|
|
- }
|
|
|
- clear_bit(STRIPE_EXPANDING, &sh->state);
|
|
|
- } else if (s.expanded) {
|
|
|
+ schedule_reconstruction(sh, &s, 1, 1);
|
|
|
+ } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
|
|
|
clear_bit(STRIPE_EXPAND_READY, &sh->state);
|
|
|
atomic_dec(&conf->reshape_stripes);
|
|
|
wake_up(&conf->wait_for_overlap);
|
|
@@ -3232,6 +3449,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
|
|
|
if (unlikely(blocked_rdev))
|
|
|
md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
|
|
|
|
|
|
+ if (s.ops_request)
|
|
|
+ raid_run_ops(sh, s.ops_request);
|
|
|
+
|
|
|
ops_run_io(sh, &s);
|
|
|
|
|
|
return_io(return_bi);
|
|
@@ -3240,16 +3460,14 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
|
|
|
}
|
|
|
|
|
|
/* returns true if the stripe was handled */
|
|
|
-static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page)
|
|
|
+static bool handle_stripe(struct stripe_head *sh) /* dispatches on sh->raid_conf->level; the tmp_page parameter is dropped by this change */
|
|
|
{
|
|
|
if (sh->raid_conf->level == 6)
|
|
|
- return handle_stripe6(sh, tmp_page);
|
|
|
+ return handle_stripe6(sh); /* level 6 handler; its result is returned unchanged */
|
|
|
else
|
|
|
return handle_stripe5(sh);
|
|
|
}
|
|
|
|
|
|
-
|
|
|
-
|
|
|
static void raid5_activate_delayed(raid5_conf_t *conf)
|
|
|
{
|
|
|
if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
|
|
@@ -3331,6 +3549,9 @@ static int raid5_congested(void *data, int bits)
|
|
|
/* No difference between reads and writes. Just check
|
|
|
* how busy the stripe_cache is
|
|
|
*/
|
|
|
+
|
|
|
+ if (mddev_congested(mddev, bits))
|
|
|
+ return 1;
|
|
|
if (conf->inactive_blocked)
|
|
|
return 1;
|
|
|
if (conf->quiesce)
|
|
@@ -3880,7 +4101,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
|
|
|
INIT_LIST_HEAD(&stripes);
|
|
|
for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
|
|
|
int j;
|
|
|
- int skipped = 0;
|
|
|
+ int skipped_disk = 0;
|
|
|
sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
|
|
|
set_bit(STRIPE_EXPANDING, &sh->state);
|
|
|
atomic_inc(&conf->reshape_stripes);
|
|
@@ -3896,14 +4117,14 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
|
|
|
continue;
|
|
|
s = compute_blocknr(sh, j, 0);
|
|
|
if (s < raid5_size(mddev, 0, 0)) {
|
|
|
- skipped = 1;
|
|
|
+ skipped_disk = 1;
|
|
|
continue;
|
|
|
}
|
|
|
memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
|
|
|
set_bit(R5_Expanded, &sh->dev[j].flags);
|
|
|
set_bit(R5_UPTODATE, &sh->dev[j].flags);
|
|
|
}
|
|
|
- if (!skipped) {
|
|
|
+ if (!skipped_disk) {
|
|
|
set_bit(STRIPE_EXPAND_READY, &sh->state);
|
|
|
set_bit(STRIPE_HANDLE, &sh->state);
|
|
|
}
|
|
@@ -4057,7 +4278,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
|
|
|
spin_unlock(&sh->lock);
|
|
|
|
|
|
/* wait for any blocked device to be handled */
|
|
|
- while(unlikely(!handle_stripe(sh, NULL)))
|
|
|
+ while (unlikely(!handle_stripe(sh)))
|
|
|
;
|
|
|
release_stripe(sh);
|
|
|
|
|
@@ -4114,7 +4335,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
|
|
|
return handled;
|
|
|
}
|
|
|
|
|
|
- handle_stripe(sh, NULL);
|
|
|
+ handle_stripe(sh);
|
|
|
release_stripe(sh);
|
|
|
handled++;
|
|
|
}
|
|
@@ -4128,6 +4349,36 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
|
|
|
return handled;
|
|
|
}
|
|
|
|
|
|
+#ifdef CONFIG_MULTICORE_RAID456
|
|
|
+/*
|
|
|
+ * Callback handed to async_schedule_domain() by process_stripe():
|
|
|
+ * @param carries the stripe_head to handle and then release.
|
|
|
+ * @cookie is not used.
|
|
|
+ */
|
|
|
+static void __process_stripe(void *param, async_cookie_t cookie)
|
|
|
+{
|
|
|
+	struct stripe_head *stripe = (struct stripe_head *)param;
|
|
|
+
|
|
|
+	handle_stripe(stripe);
|
|
|
+	release_stripe(stripe);
|
|
|
+}
|
|
|
+
|
|
|
+static void process_stripe(struct stripe_head *sh, struct list_head *domain) /* CONFIG_MULTICORE_RAID456 variant: defers the work instead of doing it inline */
|
|
|
+{
|
|
|
+ async_schedule_domain(__process_stripe, sh, domain); /* __process_stripe() handles sh and then releases it */
|
|
|
+}
|
|
|
+
|
|
|
+static void synchronize_stripe_processing(struct list_head *domain) /* raid5d calls this after its stripe loop, before async_tx_issue_pending_all() */
|
|
|
+{
|
|
|
+ async_synchronize_full_domain(domain); /* presumably blocks until everything queued on domain has run - confirm against the async API */
|
|
|
+}
|
|
|
+#else
|
|
|
+static void process_stripe(struct stripe_head *sh, struct list_head *domain) /* !CONFIG_MULTICORE_RAID456 variant: handles the stripe inline; domain is unused */
|
|
|
+{
|
|
|
+ handle_stripe(sh);
|
|
|
+ release_stripe(sh);
|
|
|
+ cond_resched(); /* scheduling point after each stripe */
|
|
|
+}
|
|
|
+
|
|
|
+static void synchronize_stripe_processing(struct list_head *domain) /* !CONFIG_MULTICORE_RAID456 stub */
|
|
|
+{ /* empty on purpose: the inline process_stripe() queues nothing, so there is nothing to wait for */
|
|
|
+}
|
|
|
+#endif
|
|
|
|
|
|
|
|
|
/*
|
|
@@ -4142,6 +4393,7 @@ static void raid5d(mddev_t *mddev)
|
|
|
struct stripe_head *sh;
|
|
|
raid5_conf_t *conf = mddev->private;
|
|
|
int handled;
|
|
|
+ LIST_HEAD(raid_domain);
|
|
|
|
|
|
pr_debug("+++ raid5d active\n");
|
|
|
|
|
@@ -4178,8 +4430,7 @@ static void raid5d(mddev_t *mddev)
|
|
|
spin_unlock_irq(&conf->device_lock);
|
|
|
|
|
|
handled++;
|
|
|
- handle_stripe(sh, conf->spare_page);
|
|
|
- release_stripe(sh);
|
|
|
+ process_stripe(sh, &raid_domain);
|
|
|
|
|
|
spin_lock_irq(&conf->device_lock);
|
|
|
}
|
|
@@ -4187,6 +4438,7 @@ static void raid5d(mddev_t *mddev)
|
|
|
|
|
|
spin_unlock_irq(&conf->device_lock);
|
|
|
|
|
|
+ synchronize_stripe_processing(&raid_domain);
|
|
|
async_tx_issue_pending_all();
|
|
|
unplug_slaves(mddev);
|
|
|
|
|
@@ -4319,15 +4571,118 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
|
|
|
return sectors * (raid_disks - conf->max_degraded);
|
|
|
}
|
|
|
|
|
|
+/*
|
|
|
+ * Undo raid5_alloc_percpu(): drop the spare page and scribble buffer of
|
|
|
+ * every possible CPU, unregister the hotplug notifier (CONFIG_HOTPLUG_CPU
|
|
|
+ * only) and free the per-cpu area itself.  Does nothing when conf->percpu
|
|
|
+ * was never allocated.
|
|
|
+ */
|
|
|
+static void raid5_free_percpu(raid5_conf_t *conf)
|
|
|
+{
|
|
|
+	unsigned long cpu;
|
|
|
+
|
|
|
+	if (conf->percpu == NULL)
|
|
|
+		return;
|
|
|
+
|
|
|
+	/* buffers are torn down between get_online_cpus()/put_online_cpus() */
|
|
|
+	get_online_cpus();
|
|
|
+	for_each_possible_cpu(cpu) {
|
|
|
+		struct raid5_percpu *pc = per_cpu_ptr(conf->percpu, cpu);
|
|
|
+
|
|
|
+		safe_put_page(pc->spare_page);
|
|
|
+		kfree(pc->scribble);
|
|
|
+	}
|
|
|
+#ifdef CONFIG_HOTPLUG_CPU
|
|
|
+	unregister_cpu_notifier(&conf->cpu_notify);
|
|
|
+#endif
|
|
|
+	put_online_cpus();
|
|
|
+
|
|
|
+	free_percpu(conf->percpu);
|
|
|
+}
|
|
|
+
|
|
|
static void free_conf(raid5_conf_t *conf)
|
|
|
{
|
|
|
shrink_stripes(conf);
|
|
|
- safe_put_page(conf->spare_page);
|
|
|
+ raid5_free_percpu(conf); /* replaces the single conf->spare_page: frees the per-cpu spare pages, scribble buffers and the percpu area */
|
|
|
kfree(conf->disks);
|
|
|
kfree(conf->stripe_hashtbl);
|
|
|
kfree(conf);
|
|
|
}
|
|
|
|
|
|
+#ifdef CONFIG_HOTPLUG_CPU
|
|
|
+/*
|
|
|
+ * CPU hotplug callback, registered by raid5_alloc_percpu().
|
|
|
+ *
|
|
|
+ * CPU_UP_PREPARE[_FROZEN]: allocate whatever the incoming CPU's slot is
|
|
|
+ * still missing - a scribble buffer of conf->scribble_len bytes and, for
|
|
|
+ * level 6 only, a spare page.
|
|
|
+ * CPU_DEAD[_FROZEN]: free both and clear the slot.
|
|
|
+ * Every other action is ignored.
|
|
|
+ *
|
|
|
+ * Returns NOTIFY_BAD if an allocation fails, NOTIFY_OK otherwise.
|
|
|
+ */
|
|
|
+static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
|
|
|
+			      void *hcpu)
|
|
|
+{
|
|
|
+	raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify);
|
|
|
+	long cpu = (long)hcpu;
|
|
|
+	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
|
|
|
+
|
|
|
+	switch (action) {
|
|
|
+	case CPU_UP_PREPARE:
|
|
|
+	case CPU_UP_PREPARE_FROZEN:
|
|
|
+		if (conf->level == 6 && !percpu->spare_page)
|
|
|
+			percpu->spare_page = alloc_page(GFP_KERNEL);
|
|
|
+		if (!percpu->scribble)
|
|
|
+			percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
|
|
|
+
|
|
|
+		if (!percpu->scribble ||
|
|
|
+		    (conf->level == 6 && !percpu->spare_page)) {
|
|
|
+			safe_put_page(percpu->spare_page);
|
|
|
+			kfree(percpu->scribble);
|
|
|
+			/*
|
|
|
+			 * Clear the slot: stale pointers would make a later
|
|
|
+			 * CPU_UP_PREPARE skip allocation, and CPU_DEAD or
|
|
|
+			 * raid5_free_percpu() would free them a second time.
|
|
|
+			 */
|
|
|
+			percpu->spare_page = NULL;
|
|
|
+			percpu->scribble = NULL;
|
|
|
+			pr_err("%s: failed memory allocation for cpu%ld\n",
|
|
|
+			       __func__, cpu);
|
|
|
+			return NOTIFY_BAD;
|
|
|
+		}
|
|
|
+		break;
|
|
|
+	case CPU_DEAD:
|
|
|
+	case CPU_DEAD_FROZEN:
|
|
|
+		safe_put_page(percpu->spare_page);
|
|
|
+		kfree(percpu->scribble);
|
|
|
+		percpu->spare_page = NULL;
|
|
|
+		percpu->scribble = NULL;
|
|
|
+		break;
|
|
|
+	default:
|
|
|
+		break;
|
|
|
+	}
|
|
|
+	return NOTIFY_OK;
|
|
|
+}
|
|
|
+#endif
|
|
|
+
|
|
|
+/*
|
|
|
+ * Allocate conf->percpu and, for every present CPU, a scribble buffer plus
|
|
|
+ * (level 6 only) a spare page.  With CONFIG_HOTPLUG_CPU the hotplug
|
|
|
+ * notifier is set up and, if every allocation succeeded, registered.
|
|
|
+ *
|
|
|
+ * Returns 0 on success, -ENOMEM when an allocation fails, or the result of
|
|
|
+ * register_cpu_notifier().  Nothing is freed here on failure and
|
|
|
+ * conf->percpu stays set, so the caller is expected to clean up
|
|
|
+ * (setup_conf() jumps to its abort label - presumably reaching free_conf();
|
|
|
+ * confirm).
|
|
|
+ */
|
|
|
+static int raid5_alloc_percpu(raid5_conf_t *conf)
|
|
|
+{
|
|
|
+	struct raid5_percpu *allcpus = alloc_percpu(struct raid5_percpu);
|
|
|
+	unsigned long cpu;
|
|
|
+	int err = 0;
|
|
|
+
|
|
|
+	if (!allcpus)
|
|
|
+		return -ENOMEM;
|
|
|
+	conf->percpu = allcpus;
|
|
|
+
|
|
|
+	get_online_cpus();
|
|
|
+	for_each_present_cpu(cpu) {
|
|
|
+		struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
|
|
|
+		struct page *spare_page;
|
|
|
+		void *scribble;
|
|
|
+
|
|
|
+		if (conf->level == 6) {
|
|
|
+			spare_page = alloc_page(GFP_KERNEL);
|
|
|
+			if (!spare_page) {
|
|
|
+				err = -ENOMEM;
|
|
|
+				break;
|
|
|
+			}
|
|
|
+			percpu->spare_page = spare_page;
|
|
|
+		}
|
|
|
+
|
|
|
+		scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL);
|
|
|
+		if (!scribble) {
|
|
|
+			err = -ENOMEM;
|
|
|
+			break;
|
|
|
+		}
|
|
|
+		percpu->scribble = scribble;
|
|
|
+	}
|
|
|
+#ifdef CONFIG_HOTPLUG_CPU
|
|
|
+	conf->cpu_notify.notifier_call = raid456_cpu_notify;
|
|
|
+	conf->cpu_notify.priority = 0;
|
|
|
+	/* only hook into hotplug once every present CPU has its buffers */
|
|
|
+	if (!err)
|
|
|
+		err = register_cpu_notifier(&conf->cpu_notify);
|
|
|
+#endif
|
|
|
+	put_online_cpus();
|
|
|
+
|
|
|
+	return err;
|
|
|
+}
|
|
|
+
|
|
|
static raid5_conf_t *setup_conf(mddev_t *mddev)
|
|
|
{
|
|
|
raid5_conf_t *conf;
|
|
@@ -4369,6 +4724,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
|
|
|
goto abort;
|
|
|
|
|
|
conf->raid_disks = mddev->raid_disks;
|
|
|
+ conf->scribble_len = scribble_len(conf->raid_disks);
|
|
|
if (mddev->reshape_position == MaxSector)
|
|
|
conf->previous_raid_disks = mddev->raid_disks;
|
|
|
else
|
|
@@ -4384,11 +4740,10 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
|
|
|
if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
|
|
|
goto abort;
|
|
|
|
|
|
- if (mddev->new_level == 6) {
|
|
|
- conf->spare_page = alloc_page(GFP_KERNEL);
|
|
|
- if (!conf->spare_page)
|
|
|
- goto abort;
|
|
|
- }
|
|
|
+ conf->level = mddev->new_level;
|
|
|
+ if (raid5_alloc_percpu(conf) != 0)
|
|
|
+ goto abort;
|
|
|
+
|
|
|
spin_lock_init(&conf->device_lock);
|
|
|
init_waitqueue_head(&conf->wait_for_stripe);
|
|
|
init_waitqueue_head(&conf->wait_for_overlap);
|
|
@@ -4447,7 +4802,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
|
|
|
printk(KERN_INFO "raid5: allocated %dkB for %s\n",
|
|
|
memory, mdname(mddev));
|
|
|
|
|
|
- conf->thread = md_register_thread(raid5d, mddev, "%s_raid5");
|
|
|
+ conf->thread = md_register_thread(raid5d, mddev, NULL);
|
|
|
if (!conf->thread) {
|
|
|
printk(KERN_ERR
|
|
|
"raid5: couldn't allocate thread for %s\n",
|
|
@@ -4613,7 +4968,7 @@ static int run(mddev_t *mddev)
|
|
|
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
|
|
|
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
|
|
|
mddev->sync_thread = md_register_thread(md_do_sync, mddev,
|
|
|
- "%s_reshape");
|
|
|
+ "reshape");
|
|
|
}
|
|
|
|
|
|
/* read-ahead size must cover two whole stripes, which is
|
|
@@ -5031,7 +5386,7 @@ static int raid5_start_reshape(mddev_t *mddev)
|
|
|
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
|
|
|
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
|
|
|
mddev->sync_thread = md_register_thread(md_do_sync, mddev,
|
|
|
- "%s_reshape");
|
|
|
+ "reshape");
|
|
|
if (!mddev->sync_thread) {
|
|
|
mddev->recovery = 0;
|
|
|
spin_lock_irq(&conf->device_lock);
|