From 2b6b24574256c05be145936f1493aec74c6904e5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 21 May 2015 15:10:01 +1000 Subject: [PATCH 01/10] md/raid5: ensure whole batch is delayed for all required bitmap updates. When we add a stripe to a batch, we need to be sure that head stripe will wait for the bitmap update required for the new stripe. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b9f2b9cc6060..c55a68f37c72 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -837,6 +837,15 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh < IO_THRESHOLD) md_wakeup_thread(conf->mddev->thread); + if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) { + int seq = sh->bm_seq; + if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) && + sh->batch_head->bm_seq > seq) + seq = sh->batch_head->bm_seq; + set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state); + sh->batch_head->bm_seq = seq; + } + atomic_inc(&sh->count); unlock_out: unlock_two_stripes(head, sh); From d0852df543e5aa7db34c1ad26d053782bcbf48f1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 27 May 2015 08:43:45 +1000 Subject: [PATCH 02/10] md/raid5: close race between STRIPE_BIT_DELAY and batching. When we add a write to a stripe we need to make sure the bitmap bit is set. While doing that the stripe is not locked so it could be added to a batch after which further changes to STRIPE_BIT_DELAY and ->bm_seq are ineffective. So we need to hold off adding to a stripe until bitmap_startwrite has completed at least once, and we need to avoid further changes to STRIPE_BIT_DELAY once the stripe has been added to a batch. If a bitmap_startwrite() completes after the stripe was added to a batch, it will not have set the bit, only incremented a counter, so no extra delay of the stripe is needed. Reported-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5.c | 25 ++++++++++++++++++++++--- drivers/md/raid5.h | 3 +++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c55a68f37c72..42d0ea6c8597 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -749,6 +749,7 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) static bool stripe_can_batch(struct stripe_head *sh) { return test_bit(STRIPE_BATCH_READY, &sh->state) && + !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && is_full_stripe_write(sh); } @@ -2996,14 +2997,32 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", (unsigned long long)(*bip)->bi_iter.bi_sector, (unsigned long long)sh->sector, dd_idx); - spin_unlock_irq(&sh->stripe_lock); if (conf->mddev->bitmap && firstwrite) { + /* Cannot hold spinlock over bitmap_startwrite, + * but must ensure this isn't added to a batch until + * we have added to the bitmap and set bm_seq. + * So set STRIPE_BITMAP_PENDING to prevent + * batching. + * If multiple add_stripe_bio() calls race here they + * much all set STRIPE_BITMAP_PENDING. So only the first one + * to complete "bitmap_startwrite" gets to set + * STRIPE_BIT_DELAY. This is important as once a stripe + * is added to a batch, STRIPE_BIT_DELAY cannot be changed + * any more. + */ + set_bit(STRIPE_BITMAP_PENDING, &sh->state); + spin_unlock_irq(&sh->stripe_lock); bitmap_startwrite(conf->mddev->bitmap, sh->sector, STRIPE_SECTORS, 0); - sh->bm_seq = conf->seq_flush+1; - set_bit(STRIPE_BIT_DELAY, &sh->state); + spin_lock_irq(&sh->stripe_lock); + clear_bit(STRIPE_BITMAP_PENDING, &sh->state); + if (!sh->batch_head) { + sh->bm_seq = conf->seq_flush+1; + set_bit(STRIPE_BIT_DELAY, &sh->state); + } } + spin_unlock_irq(&sh->stripe_lock); if (stripe_can_batch(sh)) stripe_add_to_batch_list(conf, sh); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 7dc0dd86074b..01cdb9f3a0c4 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -337,6 +337,9 @@ enum { STRIPE_ON_RELEASE_LIST, STRIPE_BATCH_READY, STRIPE_BATCH_ERR, + STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add + * to batch yet. + */ }; #define STRIPE_EXPAND_SYNC_FLAG \ From b15a9dbdbfe72848b7ed4cd3f97fe80daaf99c89 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 22 May 2015 15:20:04 +1000 Subject: [PATCH 03/10] md/raid5: Ensure a batch member is not handled prematurely. If a stripe is a member of a batch, but not the head, it must not be handled separately from the rest of the batch. 'clear_batch_ready()' handles this requirement to some extent but not completely. If a member is passed to handle_stripe() a second time it returns '0' indicating the stripe can be handled, which is wrong. So add an extra test. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 42d0ea6c8597..e58736740bac 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4200,9 +4200,13 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) static int clear_batch_ready(struct stripe_head *sh) { + /* Return '1' if this is a member of batch, or + * '0' if it is a lone stripe or a head which can now be + * handled. + */ struct stripe_head *tmp; if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) - return 0; + return (sh->batch_head && sh->batch_head != sh); spin_lock(&sh->stripe_lock); if (!sh->batch_head) { spin_unlock(&sh->stripe_lock); From 4e3d62ff4976f26d22b8b91572a49136bb3a23f1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 21 May 2015 11:50:16 +1000 Subject: [PATCH 04/10] md/raid5: remove condition test from check_break_stripe_batch_list. handle_stripe_clean_event() contains a chunk of code very similar to check_break_stripe_batch_list(). If we make the latter more like the former, we can end up with just one copy of this code. This first step removed the condition (and the 'check_') part of the name. This has the added advantage of making it clear what check is being performed at the point where the function is called. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e58736740bac..fc5c4039c394 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4234,16 +4234,11 @@ static int clear_batch_ready(struct stripe_head *sh) return 0; } -static void check_break_stripe_batch_list(struct stripe_head *sh) +static void break_stripe_batch_list(struct stripe_head *head_sh) { - struct stripe_head *head_sh, *next; + struct stripe_head *sh, *next; int i; - if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) - return; - - head_sh = sh; - list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { list_del_init(&sh->batch_list); @@ -4290,7 +4285,8 @@ static void handle_stripe(struct stripe_head *sh) return; } - check_break_stripe_batch_list(sh); + if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) + break_stripe_batch_list(sh); if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { spin_lock(&sh->stripe_lock); From fb642b92c267beeefd352af9bc461eac93a7552c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 21 May 2015 12:00:47 +1000 Subject: [PATCH 05/10] md/raid5: duplicate some more handle_stripe_clean_event code in break_stripe_batch_list break_stripe_batch list didn't clear head_sh->batch_head. This was probably a bug. Also clear all R5_Overlap flags and if any were cleared, wake up 'wait_for_overlap'. This isn't always necessary but the worst effect is a little extra checking for code that is waiting on wait_for_overlap. Also, don't use wake_up_nr() because that does the wrong thing if 'nr' is zero, and it number of flags cleared doesn't strongly correlate with the number of threads to wake. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index fc5c4039c394..6de2e1edd492 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3557,7 +3557,8 @@ static void handle_stripe_clean_event(struct r5conf *conf, spin_lock_irq(&head_sh->stripe_lock); head_sh->batch_head = NULL; spin_unlock_irq(&head_sh->stripe_lock); - wake_up_nr(&conf->wait_for_overlap, wakeup_nr); + if (wakeup_nr) + wake_up(&conf->wait_for_overlap); if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG) set_bit(STRIPE_HANDLE, &head_sh->state); } @@ -4238,6 +4239,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh) { struct stripe_head *sh, *next; int i; + int do_wakeup = 0; list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { @@ -4250,10 +4252,12 @@ static void break_stripe_batch_list(struct stripe_head *head_sh) STRIPE_EXPAND_SYNC_FLAG)); sh->check_state = head_sh->check_state; sh->reconstruct_state = head_sh->reconstruct_state; - for (i = 0; i < sh->disks; i++) + for (i = 0; i < sh->disks; i++) { + if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) + do_wakeup = 1; sh->dev[i].flags = head_sh->dev[i].flags & (~((1 << R5_WriteError) | (1 << R5_Overlap))); - + } spin_lock_irq(&sh->stripe_lock); sh->batch_head = NULL; spin_unlock_irq(&sh->stripe_lock); @@ -4261,6 +4265,15 @@ static void break_stripe_batch_list(struct stripe_head *head_sh) set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } + spin_lock_irq(&head_sh->stripe_lock); + head_sh->batch_head = NULL; + spin_unlock_irq(&head_sh->stripe_lock); + for (i = 0; i < head_sh->disks; i++) + if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) + do_wakeup = 1; + + if (do_wakeup) + wake_up(&head_sh->raid_conf->wait_for_overlap); } static void handle_stripe(struct stripe_head *sh) From 3960ce796198254b7a1b420dc9a26d80928523bd Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 21 May 2015 12:20:36 +1000 Subject: [PATCH 06/10] md/raid5: add handle_flags arg to break_stripe_batch_list. When we break a stripe_batch_list we sometimes want to set STRIPE_HANDLE on the individual stripes, and sometimes not. So pass a 'handle_flags' arg. If it is zero, always set STRIPE_HANDLE (on non-head stripes). If not zero, only set it if any of the given flags are present. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6de2e1edd492..0b65eb51e562 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4235,7 +4235,8 @@ static int clear_batch_ready(struct stripe_head *sh) return 0; } -static void break_stripe_batch_list(struct stripe_head *head_sh) +static void break_stripe_batch_list(struct stripe_head *head_sh, + unsigned long handle_flags) { struct stripe_head *sh, *next; int i; @@ -4261,8 +4262,9 @@ static void break_stripe_batch_list(struct stripe_head *head_sh) spin_lock_irq(&sh->stripe_lock); sh->batch_head = NULL; spin_unlock_irq(&sh->stripe_lock); - - set_bit(STRIPE_HANDLE, &sh->state); + if (handle_flags == 0 || + sh->state & handle_flags) + set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } spin_lock_irq(&head_sh->stripe_lock); @@ -4271,6 +4273,8 @@ static void break_stripe_batch_list(struct stripe_head *head_sh) for (i = 0; i < head_sh->disks; i++) if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) do_wakeup = 1; + if (head_sh->state & handle_flags) + set_bit(STRIPE_HANDLE, &head_sh->state); if (do_wakeup) wake_up(&head_sh->raid_conf->wait_for_overlap); @@ -4299,7 +4303,7 @@ static void handle_stripe(struct stripe_head *sh) } if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) - break_stripe_batch_list(sh); + break_stripe_batch_list(sh, 0); if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { spin_lock(&sh->stripe_lock); From 1b956f7a8f9aa63ea9644ab8c3374cf381993363 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 21 May 2015 12:40:26 +1000 Subject: [PATCH 07/10] md/raid5: be more selective about distributing flags across batch. When a batch of stripes is broken up, we keep some of the flags that were per-stripe, and copy other flags from the head to all others. This only happens while a stripe is being handled, so many of the flags are irrelevant. The "SYNC_FLAGS" (which I've renamed to make it clear there are several) and STRIPE_DEGRADED are set per-stripe and so need to be preserved. STRIPE_INSYNC is the only flag that is set on the head that needs to be propagated to all others. For safety, add a WARN_ON if others are set, except: STRIPE_HANDLE - this is safe and per-stripe and we are going to set in several cases anyway STRIPE_INSYNC STRIPE_IO_STARTED - this is just a hint and doesn't hurt. STRIPE_ON_PLUG_LIST STRIPE_ON_RELEASE_LIST - It is a point pointless for a batched stripe to be on one of these lists, but it can happen as can be safely ignored. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 55 ++++++++++++++++++++++++++++++++++++---------- drivers/md/raid5.h | 2 +- 2 files changed, 45 insertions(+), 12 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0b65eb51e562..1141b7f62e6e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3534,10 +3534,27 @@ static void handle_stripe_clean_event(struct r5conf *conf, struct stripe_head, batch_list); list_del_init(&sh->batch_list); - set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG, - head_sh->state & ~((1 << STRIPE_ACTIVE) | - (1 << STRIPE_PREREAD_ACTIVE) | - STRIPE_EXPAND_SYNC_FLAG)); + WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | + (1 << STRIPE_SYNCING) | + (1 << STRIPE_REPLACED) | + (1 << STRIPE_PREREAD_ACTIVE) | + (1 << STRIPE_DELAYED) | + (1 << STRIPE_BIT_DELAY) | + (1 << STRIPE_FULL_WRITE) | + (1 << STRIPE_BIOFILL_RUN) | + (1 << STRIPE_COMPUTE_RUN) | + (1 << STRIPE_OPS_REQ_PENDING) | + (1 << STRIPE_DISCARD) | + (1 << STRIPE_BATCH_READY) | + (1 << STRIPE_BATCH_ERR) | + (1 << STRIPE_BITMAP_PENDING))); + WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | + (1 << STRIPE_REPLACED))); + + set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | + (1 << STRIPE_DEGRADED)), + head_sh->state & (1 << STRIPE_INSYNC)); + sh->check_state = head_sh->check_state; sh->reconstruct_state = head_sh->reconstruct_state; for (i = 0; i < sh->disks; i++) { @@ -3549,7 +3566,7 @@ static void handle_stripe_clean_event(struct r5conf *conf, spin_lock_irq(&sh->stripe_lock); sh->batch_head = NULL; spin_unlock_irq(&sh->stripe_lock); - if (sh->state & STRIPE_EXPAND_SYNC_FLAG) + if (sh->state & STRIPE_EXPAND_SYNC_FLAGS) set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } @@ -3559,7 +3576,7 @@ static void handle_stripe_clean_event(struct r5conf *conf, spin_unlock_irq(&head_sh->stripe_lock); if (wakeup_nr) wake_up(&conf->wait_for_overlap); - if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG) + if (head_sh->state & STRIPE_EXPAND_SYNC_FLAGS) set_bit(STRIPE_HANDLE, &head_sh->state); } @@ -4246,11 +4263,27 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, list_del_init(&sh->batch_list); - set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG, - head_sh->state & ~((1 << STRIPE_ACTIVE) | - (1 << STRIPE_PREREAD_ACTIVE) | - (1 << STRIPE_DEGRADED) | - STRIPE_EXPAND_SYNC_FLAG)); + WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | + (1 << STRIPE_SYNCING) | + (1 << STRIPE_REPLACED) | + (1 << STRIPE_PREREAD_ACTIVE) | + (1 << STRIPE_DELAYED) | + (1 << STRIPE_BIT_DELAY) | + (1 << STRIPE_FULL_WRITE) | + (1 << STRIPE_BIOFILL_RUN) | + (1 << STRIPE_COMPUTE_RUN) | + (1 << STRIPE_OPS_REQ_PENDING) | + (1 << STRIPE_DISCARD) | + (1 << STRIPE_BATCH_READY) | + (1 << STRIPE_BATCH_ERR) | + (1 << STRIPE_BITMAP_PENDING))); + WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | + (1 << STRIPE_REPLACED))); + + set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | + (1 << STRIPE_DEGRADED)), + head_sh->state & (1 << STRIPE_INSYNC)); + sh->check_state = head_sh->check_state; sh->reconstruct_state = head_sh->reconstruct_state; for (i = 0; i < sh->disks; i++) { diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 01cdb9f3a0c4..896d603ad0da 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -342,7 +342,7 @@ enum { */ }; -#define STRIPE_EXPAND_SYNC_FLAG \ +#define STRIPE_EXPAND_SYNC_FLAGS \ ((1 << STRIPE_EXPAND_SOURCE) |\ (1 << STRIPE_EXPAND_READY) |\ (1 << STRIPE_EXPANDING) |\ From 787b76fa37159050f6d26aebfa6210009baed93b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 21 May 2015 12:56:41 +1000 Subject: [PATCH 08/10] md/raid5: call break_stripe_batch_list from handle_stripe_clean_event Now that the code in break_stripe_batch_list() is nearly identical to the end of handle_stripe_clean_event, replace the later with a function call. The only remaining difference of any interest is the masking that is applieds to dev[i].flags copied from head_sh. R5_WriteError certainly isn't wanted as it is set per-stripe, not per-patch. R5_Overlap isn't wanted as it is explicitly handled. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 61 +++------------------------------------------- 1 file changed, 4 insertions(+), 57 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1141b7f62e6e..3254504b1080 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3420,6 +3420,8 @@ static void handle_stripe_fill(struct stripe_head *sh, set_bit(STRIPE_HANDLE, &sh->state); } +static void break_stripe_batch_list(struct stripe_head *head_sh, + unsigned long handle_flags); /* handle_stripe_clean_event * any written block on an uptodate or failed drive can be returned. * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but @@ -3433,7 +3435,6 @@ static void handle_stripe_clean_event(struct r5conf *conf, int discard_pending = 0; struct stripe_head *head_sh = sh; bool do_endio = false; - int wakeup_nr = 0; for (i = disks; i--; ) if (sh->dev[i].written) { @@ -3522,62 +3523,8 @@ static void handle_stripe_clean_event(struct r5conf *conf, if (atomic_dec_and_test(&conf->pending_full_writes)) md_wakeup_thread(conf->mddev->thread); - if (!head_sh->batch_head || !do_endio) - return; - for (i = 0; i < head_sh->disks; i++) { - if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) - wakeup_nr++; - } - while (!list_empty(&head_sh->batch_list)) { - int i; - sh = list_first_entry(&head_sh->batch_list, - struct stripe_head, batch_list); - list_del_init(&sh->batch_list); - - WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | - (1 << STRIPE_SYNCING) | - (1 << STRIPE_REPLACED) | - (1 << STRIPE_PREREAD_ACTIVE) | - (1 << STRIPE_DELAYED) | - (1 << STRIPE_BIT_DELAY) | - (1 << STRIPE_FULL_WRITE) | - (1 << STRIPE_BIOFILL_RUN) | - (1 << STRIPE_COMPUTE_RUN) | - (1 << STRIPE_OPS_REQ_PENDING) | - (1 << STRIPE_DISCARD) | - (1 << STRIPE_BATCH_READY) | - (1 << STRIPE_BATCH_ERR) | - (1 << STRIPE_BITMAP_PENDING))); - WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | - (1 << STRIPE_REPLACED))); - - set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | - (1 << STRIPE_DEGRADED)), - head_sh->state & (1 << STRIPE_INSYNC)); - - sh->check_state = head_sh->check_state; - sh->reconstruct_state = head_sh->reconstruct_state; - for (i = 0; i < sh->disks; i++) { - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wakeup_nr++; - sh->dev[i].flags = head_sh->dev[i].flags; - } - - spin_lock_irq(&sh->stripe_lock); - sh->batch_head = NULL; - spin_unlock_irq(&sh->stripe_lock); - if (sh->state & STRIPE_EXPAND_SYNC_FLAGS) - set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); - } - - spin_lock_irq(&head_sh->stripe_lock); - head_sh->batch_head = NULL; - spin_unlock_irq(&head_sh->stripe_lock); - if (wakeup_nr) - wake_up(&conf->wait_for_overlap); - if (head_sh->state & STRIPE_EXPAND_SYNC_FLAGS) - set_bit(STRIPE_HANDLE, &head_sh->state); + if (head_sh->batch_head && do_endio) + break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); } static void handle_stripe_dirtying(struct r5conf *conf, From 626f2092c85ac847bb80b3257eb6a565dec32278 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 22 May 2015 14:03:10 +1000 Subject: [PATCH 09/10] md/raid5: break stripe-batches when the array has failed. Once the array has too much failure, we need to break stripe-batches up so they can all be dealt with. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3254504b1080..553d54b87052 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4337,6 +4337,7 @@ static void handle_stripe(struct stripe_head *sh) if (s.failed > conf->max_degraded) { sh->check_state = 0; sh->reconstruct_state = 0; + break_stripe_batch_list(sh, 0); if (s.to_read+s.to_write+s.written) handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); if (s.syncing + s.replacing) From 56ccc1125bc141cf63927eda7febff4216dea2d3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 28 May 2015 17:53:29 +1000 Subject: [PATCH 10/10] md: fix race when unfreezing sync_action A recent change removed the need for locking around writing to "sync_action" (and various other places), but introduced a subtle race. When e.g. setting 'reshape' on a 'frozen' array, the 'frozen' flag is cleared before 'reshape' is set, so the md thread can get in and start trying recovery - which isn't wanted. So instead of clearing MD_RECOVERY_FROZEN for any command except 'frozen', only clear it when each specific command is parsed. This allows the handling of 'reshape' to clear the bit while a lock is held. Also remove some places where we set MD_RECOVERY_NEEDED, as it is always set on non-error exit of the function. Signed-off-by: NeilBrown Fixes: 6791875e2e53 ("md: make reconfig_mutex optional for writes to md sysfs files.") --- drivers/md/md.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index d4f31e195e26..8f10f4ea70ea 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4211,12 +4211,12 @@ action_store(struct mddev *mddev, const char *page, size_t len) if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; - if (cmd_match(page, "frozen")) - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - else - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { + if (cmd_match(page, "frozen")) + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + else + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); flush_workqueue(md_misc_wq); if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); @@ -4229,16 +4229,17 @@ action_store(struct mddev *mddev, const char *page, size_t len) test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return -EBUSY; else if (cmd_match(page, "resync")) - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); else if (cmd_match(page, "recover")) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } else if (cmd_match(page, "reshape")) { int err; if (mddev->pers->start_reshape == NULL) return -EINVAL; err = mddev_lock(mddev); if (!err) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); err = mddev->pers->start_reshape(mddev); mddev_unlock(mddev); } @@ -4250,6 +4251,7 @@ action_store(struct mddev *mddev, const char *page, size_t len) set_bit(MD_RECOVERY_CHECK, &mddev->recovery); else if (!cmd_match(page, "repair")) return -EINVAL; + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); }