From 2b6b24574256c05be145936f1493aec74c6904e5 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 21 May 2015 15:10:01 +1000
Subject: [PATCH 01/10] md/raid5: ensure whole batch is delayed for all
 required bitmap updates.

When we add a stripe to a batch, we need to be sure that
head stripe will wait for the bitmap update required for the new
stripe.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b9f2b9cc6060..c55a68f37c72 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -837,6 +837,15 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
 		    < IO_THRESHOLD)
 			md_wakeup_thread(conf->mddev->thread);
 
+	if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
+		int seq = sh->bm_seq;
+		if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
+		    sh->batch_head->bm_seq > seq)
+			seq = sh->batch_head->bm_seq;
+		set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
+		sh->batch_head->bm_seq = seq;
+	}
+
 	atomic_inc(&sh->count);
 unlock_out:
 	unlock_two_stripes(head, sh);

From d0852df543e5aa7db34c1ad26d053782bcbf48f1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 27 May 2015 08:43:45 +1000
Subject: [PATCH 02/10] md/raid5: close race between STRIPE_BIT_DELAY and
 batching.

When we add a write to a stripe we need to make sure the bitmap
bit is set.  While doing that the stripe is not locked so it could
be added to a batch after which further changes to STRIPE_BIT_DELAY
and ->bm_seq are ineffective.

So we need to hold off adding to a stripe until bitmap_startwrite has
completed at least once, and we need to avoid further changes to
STRIPE_BIT_DELAY once the stripe has been added to a batch.

If a bitmap_startwrite() completes after the stripe was added to a
batch, it will not have set the bit, only incremented a counter, so no
extra delay of the stripe is needed.

Reported-by: Shaohua Li <shli@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 25 ++++++++++++++++++++++---
 drivers/md/raid5.h |  3 +++
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c55a68f37c72..42d0ea6c8597 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -749,6 +749,7 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 static bool stripe_can_batch(struct stripe_head *sh)
 {
 	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
+		!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
 		is_full_stripe_write(sh);
 }
 
@@ -2996,14 +2997,32 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
 		(unsigned long long)(*bip)->bi_iter.bi_sector,
 		(unsigned long long)sh->sector, dd_idx);
-	spin_unlock_irq(&sh->stripe_lock);
 
 	if (conf->mddev->bitmap && firstwrite) {
+		/* Cannot hold spinlock over bitmap_startwrite,
+		 * but must ensure this isn't added to a batch until
+		 * we have added to the bitmap and set bm_seq.
+		 * So set STRIPE_BITMAP_PENDING to prevent
+		 * batching.
+		 * If multiple add_stripe_bio() calls race here they
+		 * much all set STRIPE_BITMAP_PENDING.  So only the first one
+		 * to complete "bitmap_startwrite" gets to set
+		 * STRIPE_BIT_DELAY.  This is important as once a stripe
+		 * is added to a batch, STRIPE_BIT_DELAY cannot be changed
+		 * any more.
+		 */
+		set_bit(STRIPE_BITMAP_PENDING, &sh->state);
+		spin_unlock_irq(&sh->stripe_lock);
 		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
 				  STRIPE_SECTORS, 0);
-		sh->bm_seq = conf->seq_flush+1;
-		set_bit(STRIPE_BIT_DELAY, &sh->state);
+		spin_lock_irq(&sh->stripe_lock);
+		clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
+		if (!sh->batch_head) {
+			sh->bm_seq = conf->seq_flush+1;
+			set_bit(STRIPE_BIT_DELAY, &sh->state);
+		}
 	}
+	spin_unlock_irq(&sh->stripe_lock);
 
 	if (stripe_can_batch(sh))
 		stripe_add_to_batch_list(conf, sh);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 7dc0dd86074b..01cdb9f3a0c4 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -337,6 +337,9 @@ enum {
 	STRIPE_ON_RELEASE_LIST,
 	STRIPE_BATCH_READY,
 	STRIPE_BATCH_ERR,
+	STRIPE_BITMAP_PENDING,	/* Being added to bitmap, don't add
+				 * to batch yet.
+				 */
 };
 
 #define STRIPE_EXPAND_SYNC_FLAG \

From b15a9dbdbfe72848b7ed4cd3f97fe80daaf99c89 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 22 May 2015 15:20:04 +1000
Subject: [PATCH 03/10] md/raid5: Ensure a batch member is not handled
 prematurely.

If a stripe is a member of a batch, but not the head, it must
not be handled separately from the rest of the batch.

'clear_batch_ready()' handles this requirement to some
extent but not completely.  If a member is passed to handle_stripe()
a second time it returns '0' indicating the stripe can be handled,
which is wrong.
So add an extra test.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 42d0ea6c8597..e58736740bac 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4200,9 +4200,13 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 
 static int clear_batch_ready(struct stripe_head *sh)
 {
+	/* Return '1' if this is a member of batch, or
+	 * '0' if it is a lone stripe or a head which can now be
+	 * handled.
+	 */
 	struct stripe_head *tmp;
 	if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
-		return 0;
+		return (sh->batch_head && sh->batch_head != sh);
 	spin_lock(&sh->stripe_lock);
 	if (!sh->batch_head) {
 		spin_unlock(&sh->stripe_lock);

From 4e3d62ff4976f26d22b8b91572a49136bb3a23f1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 21 May 2015 11:50:16 +1000
Subject: [PATCH 04/10] md/raid5: remove condition test from
 check_break_stripe_batch_list.

handle_stripe_clean_event() contains a chunk of code very
similar to check_break_stripe_batch_list().
If we make the latter more like the former, we can end up
with just one copy of this code.

This  first step removed the condition (and the 'check_') part
of the name.  This has the added advantage of making it clear
what check is being performed at the point where the function is
called.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e58736740bac..fc5c4039c394 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4234,16 +4234,11 @@ static int clear_batch_ready(struct stripe_head *sh)
 	return 0;
 }
 
-static void check_break_stripe_batch_list(struct stripe_head *sh)
+static void break_stripe_batch_list(struct stripe_head *head_sh)
 {
-	struct stripe_head *head_sh, *next;
+	struct stripe_head *sh, *next;
 	int i;
 
-	if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
-		return;
-
-	head_sh = sh;
-
 	list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
 
 		list_del_init(&sh->batch_list);
@@ -4290,7 +4285,8 @@ static void handle_stripe(struct stripe_head *sh)
 		return;
 	}
 
-	check_break_stripe_batch_list(sh);
+	if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
+		break_stripe_batch_list(sh);
 
 	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
 		spin_lock(&sh->stripe_lock);

From fb642b92c267beeefd352af9bc461eac93a7552c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 21 May 2015 12:00:47 +1000
Subject: [PATCH 05/10] md/raid5: duplicate some more handle_stripe_clean_event
 code in break_stripe_batch_list

break_stripe_batch list didn't clear head_sh->batch_head.
This was probably a bug.

Also clear all R5_Overlap flags and if any were cleared, wake up
'wait_for_overlap'.
This isn't always necessary but the worst effect is a little
extra checking for code that is waiting on wait_for_overlap.

Also, don't use wake_up_nr() because that does the wrong thing
if 'nr' is zero, and it number of flags cleared doesn't
strongly correlate with the number of threads to wake.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index fc5c4039c394..6de2e1edd492 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3557,7 +3557,8 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 	spin_lock_irq(&head_sh->stripe_lock);
 	head_sh->batch_head = NULL;
 	spin_unlock_irq(&head_sh->stripe_lock);
-	wake_up_nr(&conf->wait_for_overlap, wakeup_nr);
+	if (wakeup_nr)
+		wake_up(&conf->wait_for_overlap);
 	if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG)
 		set_bit(STRIPE_HANDLE, &head_sh->state);
 }
@@ -4238,6 +4239,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh)
 {
 	struct stripe_head *sh, *next;
 	int i;
+	int do_wakeup = 0;
 
 	list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
 
@@ -4250,10 +4252,12 @@ static void break_stripe_batch_list(struct stripe_head *head_sh)
 						 STRIPE_EXPAND_SYNC_FLAG));
 		sh->check_state = head_sh->check_state;
 		sh->reconstruct_state = head_sh->reconstruct_state;
-		for (i = 0; i < sh->disks; i++)
+		for (i = 0; i < sh->disks; i++) {
+			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+				do_wakeup = 1;
 			sh->dev[i].flags = head_sh->dev[i].flags &
 				(~((1 << R5_WriteError) | (1 << R5_Overlap)));
-
+		}
 		spin_lock_irq(&sh->stripe_lock);
 		sh->batch_head = NULL;
 		spin_unlock_irq(&sh->stripe_lock);
@@ -4261,6 +4265,15 @@ static void break_stripe_batch_list(struct stripe_head *head_sh)
 		set_bit(STRIPE_HANDLE, &sh->state);
 		release_stripe(sh);
 	}
+	spin_lock_irq(&head_sh->stripe_lock);
+	head_sh->batch_head = NULL;
+	spin_unlock_irq(&head_sh->stripe_lock);
+	for (i = 0; i < head_sh->disks; i++)
+		if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
+			do_wakeup = 1;
+
+	if (do_wakeup)
+		wake_up(&head_sh->raid_conf->wait_for_overlap);
 }
 
 static void handle_stripe(struct stripe_head *sh)

From 3960ce796198254b7a1b420dc9a26d80928523bd Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 21 May 2015 12:20:36 +1000
Subject: [PATCH 06/10] md/raid5: add handle_flags arg to
 break_stripe_batch_list.

When we break a stripe_batch_list we sometimes want to set
STRIPE_HANDLE on the individual stripes, and sometimes not.

So pass a 'handle_flags' arg.  If it is zero, always set STRIPE_HANDLE
(on non-head stripes).  If not zero, only set it if any of the given
flags are present.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6de2e1edd492..0b65eb51e562 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4235,7 +4235,8 @@ static int clear_batch_ready(struct stripe_head *sh)
 	return 0;
 }
 
-static void break_stripe_batch_list(struct stripe_head *head_sh)
+static void break_stripe_batch_list(struct stripe_head *head_sh,
+				    unsigned long handle_flags)
 {
 	struct stripe_head *sh, *next;
 	int i;
@@ -4261,8 +4262,9 @@ static void break_stripe_batch_list(struct stripe_head *head_sh)
 		spin_lock_irq(&sh->stripe_lock);
 		sh->batch_head = NULL;
 		spin_unlock_irq(&sh->stripe_lock);
-
-		set_bit(STRIPE_HANDLE, &sh->state);
+		if (handle_flags == 0 ||
+		    sh->state & handle_flags)
+			set_bit(STRIPE_HANDLE, &sh->state);
 		release_stripe(sh);
 	}
 	spin_lock_irq(&head_sh->stripe_lock);
@@ -4271,6 +4273,8 @@ static void break_stripe_batch_list(struct stripe_head *head_sh)
 	for (i = 0; i < head_sh->disks; i++)
 		if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
 			do_wakeup = 1;
+	if (head_sh->state & handle_flags)
+		set_bit(STRIPE_HANDLE, &head_sh->state);
 
 	if (do_wakeup)
 		wake_up(&head_sh->raid_conf->wait_for_overlap);
@@ -4299,7 +4303,7 @@ static void handle_stripe(struct stripe_head *sh)
 	}
 
 	if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
-		break_stripe_batch_list(sh);
+		break_stripe_batch_list(sh, 0);
 
 	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
 		spin_lock(&sh->stripe_lock);

From 1b956f7a8f9aa63ea9644ab8c3374cf381993363 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 21 May 2015 12:40:26 +1000
Subject: [PATCH 07/10] md/raid5: be more selective about distributing flags
 across batch.

When a batch of stripes is broken up, we keep some of the flags
that were per-stripe, and copy other flags from the head to all
others.

This only happens while a stripe is being handled, so many of the
flags are irrelevant.

The "SYNC_FLAGS" (which I've renamed to make it clear there are
several) and STRIPE_DEGRADED are set per-stripe and so need to be
preserved.  STRIPE_INSYNC is the only flag that is set on the head
that needs to be propagated to all others.

For safety, add a WARN_ON if others are set, except:
 STRIPE_HANDLE - this is safe and per-stripe and we are going to set
      in several cases anyway
 STRIPE_INSYNC
 STRIPE_IO_STARTED - this is just a hint and doesn't hurt.
 STRIPE_ON_PLUG_LIST
 STRIPE_ON_RELEASE_LIST - It is a point pointless for a batched
           stripe to be on one of these lists, but it can happen
           as can be safely ignored.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 55 ++++++++++++++++++++++++++++++++++++----------
 drivers/md/raid5.h |  2 +-
 2 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0b65eb51e562..1141b7f62e6e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3534,10 +3534,27 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 				      struct stripe_head, batch_list);
 		list_del_init(&sh->batch_list);
 
-		set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
-			      head_sh->state & ~((1 << STRIPE_ACTIVE) |
-						 (1 << STRIPE_PREREAD_ACTIVE) |
-						 STRIPE_EXPAND_SYNC_FLAG));
+		WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
+					  (1 << STRIPE_SYNCING) |
+					  (1 << STRIPE_REPLACED) |
+					  (1 << STRIPE_PREREAD_ACTIVE) |
+					  (1 << STRIPE_DELAYED) |
+					  (1 << STRIPE_BIT_DELAY) |
+					  (1 << STRIPE_FULL_WRITE) |
+					  (1 << STRIPE_BIOFILL_RUN) |
+					  (1 << STRIPE_COMPUTE_RUN)  |
+					  (1 << STRIPE_OPS_REQ_PENDING) |
+					  (1 << STRIPE_DISCARD) |
+					  (1 << STRIPE_BATCH_READY) |
+					  (1 << STRIPE_BATCH_ERR) |
+					  (1 << STRIPE_BITMAP_PENDING)));
+		WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
+					      (1 << STRIPE_REPLACED)));
+
+		set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
+					    (1 << STRIPE_DEGRADED)),
+			      head_sh->state & (1 << STRIPE_INSYNC));
+
 		sh->check_state = head_sh->check_state;
 		sh->reconstruct_state = head_sh->reconstruct_state;
 		for (i = 0; i < sh->disks; i++) {
@@ -3549,7 +3566,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 		spin_lock_irq(&sh->stripe_lock);
 		sh->batch_head = NULL;
 		spin_unlock_irq(&sh->stripe_lock);
-		if (sh->state & STRIPE_EXPAND_SYNC_FLAG)
+		if (sh->state & STRIPE_EXPAND_SYNC_FLAGS)
 			set_bit(STRIPE_HANDLE, &sh->state);
 		release_stripe(sh);
 	}
@@ -3559,7 +3576,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 	spin_unlock_irq(&head_sh->stripe_lock);
 	if (wakeup_nr)
 		wake_up(&conf->wait_for_overlap);
-	if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG)
+	if (head_sh->state & STRIPE_EXPAND_SYNC_FLAGS)
 		set_bit(STRIPE_HANDLE, &head_sh->state);
 }
 
@@ -4246,11 +4263,27 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
 
 		list_del_init(&sh->batch_list);
 
-		set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
-			      head_sh->state & ~((1 << STRIPE_ACTIVE) |
-						 (1 << STRIPE_PREREAD_ACTIVE) |
-						 (1 << STRIPE_DEGRADED) |
-						 STRIPE_EXPAND_SYNC_FLAG));
+		WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
+					  (1 << STRIPE_SYNCING) |
+					  (1 << STRIPE_REPLACED) |
+					  (1 << STRIPE_PREREAD_ACTIVE) |
+					  (1 << STRIPE_DELAYED) |
+					  (1 << STRIPE_BIT_DELAY) |
+					  (1 << STRIPE_FULL_WRITE) |
+					  (1 << STRIPE_BIOFILL_RUN) |
+					  (1 << STRIPE_COMPUTE_RUN)  |
+					  (1 << STRIPE_OPS_REQ_PENDING) |
+					  (1 << STRIPE_DISCARD) |
+					  (1 << STRIPE_BATCH_READY) |
+					  (1 << STRIPE_BATCH_ERR) |
+					  (1 << STRIPE_BITMAP_PENDING)));
+		WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
+					      (1 << STRIPE_REPLACED)));
+
+		set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
+					    (1 << STRIPE_DEGRADED)),
+			      head_sh->state & (1 << STRIPE_INSYNC));
+
 		sh->check_state = head_sh->check_state;
 		sh->reconstruct_state = head_sh->reconstruct_state;
 		for (i = 0; i < sh->disks; i++) {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 01cdb9f3a0c4..896d603ad0da 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -342,7 +342,7 @@ enum {
 				 */
 };
 
-#define STRIPE_EXPAND_SYNC_FLAG \
+#define STRIPE_EXPAND_SYNC_FLAGS \
 	((1 << STRIPE_EXPAND_SOURCE) |\
 	(1 << STRIPE_EXPAND_READY) |\
 	(1 << STRIPE_EXPANDING) |\

From 787b76fa37159050f6d26aebfa6210009baed93b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 21 May 2015 12:56:41 +1000
Subject: [PATCH 08/10] md/raid5: call break_stripe_batch_list from
 handle_stripe_clean_event

Now that the code in break_stripe_batch_list() is nearly identical
to the end of handle_stripe_clean_event, replace the later
with a function call.

The only remaining difference of any interest is the masking that is
applieds to dev[i].flags copied from head_sh.
R5_WriteError certainly isn't wanted as it is set per-stripe, not
per-patch.  R5_Overlap isn't wanted as it is explicitly handled.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 61 +++-------------------------------------------
 1 file changed, 4 insertions(+), 57 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1141b7f62e6e..3254504b1080 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3420,6 +3420,8 @@ static void handle_stripe_fill(struct stripe_head *sh,
 	set_bit(STRIPE_HANDLE, &sh->state);
 }
 
+static void break_stripe_batch_list(struct stripe_head *head_sh,
+				    unsigned long handle_flags);
 /* handle_stripe_clean_event
  * any written block on an uptodate or failed drive can be returned.
  * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
@@ -3433,7 +3435,6 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 	int discard_pending = 0;
 	struct stripe_head *head_sh = sh;
 	bool do_endio = false;
-	int wakeup_nr = 0;
 
 	for (i = disks; i--; )
 		if (sh->dev[i].written) {
@@ -3522,62 +3523,8 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 		if (atomic_dec_and_test(&conf->pending_full_writes))
 			md_wakeup_thread(conf->mddev->thread);
 
-	if (!head_sh->batch_head || !do_endio)
-		return;
-	for (i = 0; i < head_sh->disks; i++) {
-		if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
-			wakeup_nr++;
-	}
-	while (!list_empty(&head_sh->batch_list)) {
-		int i;
-		sh = list_first_entry(&head_sh->batch_list,
-				      struct stripe_head, batch_list);
-		list_del_init(&sh->batch_list);
-
-		WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
-					  (1 << STRIPE_SYNCING) |
-					  (1 << STRIPE_REPLACED) |
-					  (1 << STRIPE_PREREAD_ACTIVE) |
-					  (1 << STRIPE_DELAYED) |
-					  (1 << STRIPE_BIT_DELAY) |
-					  (1 << STRIPE_FULL_WRITE) |
-					  (1 << STRIPE_BIOFILL_RUN) |
-					  (1 << STRIPE_COMPUTE_RUN)  |
-					  (1 << STRIPE_OPS_REQ_PENDING) |
-					  (1 << STRIPE_DISCARD) |
-					  (1 << STRIPE_BATCH_READY) |
-					  (1 << STRIPE_BATCH_ERR) |
-					  (1 << STRIPE_BITMAP_PENDING)));
-		WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
-					      (1 << STRIPE_REPLACED)));
-
-		set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
-					    (1 << STRIPE_DEGRADED)),
-			      head_sh->state & (1 << STRIPE_INSYNC));
-
-		sh->check_state = head_sh->check_state;
-		sh->reconstruct_state = head_sh->reconstruct_state;
-		for (i = 0; i < sh->disks; i++) {
-			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
-				wakeup_nr++;
-			sh->dev[i].flags = head_sh->dev[i].flags;
-		}
-
-		spin_lock_irq(&sh->stripe_lock);
-		sh->batch_head = NULL;
-		spin_unlock_irq(&sh->stripe_lock);
-		if (sh->state & STRIPE_EXPAND_SYNC_FLAGS)
-			set_bit(STRIPE_HANDLE, &sh->state);
-		release_stripe(sh);
-	}
-
-	spin_lock_irq(&head_sh->stripe_lock);
-	head_sh->batch_head = NULL;
-	spin_unlock_irq(&head_sh->stripe_lock);
-	if (wakeup_nr)
-		wake_up(&conf->wait_for_overlap);
-	if (head_sh->state & STRIPE_EXPAND_SYNC_FLAGS)
-		set_bit(STRIPE_HANDLE, &head_sh->state);
+	if (head_sh->batch_head && do_endio)
+		break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
 }
 
 static void handle_stripe_dirtying(struct r5conf *conf,

From 626f2092c85ac847bb80b3257eb6a565dec32278 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 22 May 2015 14:03:10 +1000
Subject: [PATCH 09/10] md/raid5: break stripe-batches when the array has
 failed.

Once the array has too much failure, we need to break
stripe-batches up so they can all be dealt with.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3254504b1080..553d54b87052 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4337,6 +4337,7 @@ static void handle_stripe(struct stripe_head *sh)
 	if (s.failed > conf->max_degraded) {
 		sh->check_state = 0;
 		sh->reconstruct_state = 0;
+		break_stripe_batch_list(sh, 0);
 		if (s.to_read+s.to_write+s.written)
 			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
 		if (s.syncing + s.replacing)

From 56ccc1125bc141cf63927eda7febff4216dea2d3 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 May 2015 17:53:29 +1000
Subject: [PATCH 10/10] md: fix race when unfreezing sync_action

A recent change removed the need for locking around writing
to "sync_action" (and various other places), but introduced a
subtle race.
When e.g. setting 'reshape' on a 'frozen' array, the 'frozen'
flag is cleared before 'reshape' is set, so the md thread can
get in and start trying recovery - which isn't wanted.

So instead of clearing MD_RECOVERY_FROZEN for any command
except 'frozen', only clear it when each specific command
is parsed.  This allows the handling of 'reshape' to clear
the bit while a lock is held.

Also remove some places where we set MD_RECOVERY_NEEDED,
as it is always set on non-error exit of the function.


Signed-off-by: NeilBrown <neilb@suse.de>
Fixes: 6791875e2e53 ("md: make reconfig_mutex optional for writes to md sysfs files.")
---
 drivers/md/md.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d4f31e195e26..8f10f4ea70ea 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4211,12 +4211,12 @@ action_store(struct mddev *mddev, const char *page, size_t len)
 	if (!mddev->pers || !mddev->pers->sync_request)
 		return -EINVAL;
 
-	if (cmd_match(page, "frozen"))
-		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-	else
-		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 
 	if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
+		if (cmd_match(page, "frozen"))
+			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+		else
+			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 		flush_workqueue(md_misc_wq);
 		if (mddev->sync_thread) {
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
@@ -4229,16 +4229,17 @@ action_store(struct mddev *mddev, const char *page, size_t len)
 		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
 		return -EBUSY;
 	else if (cmd_match(page, "resync"))
-		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 	else if (cmd_match(page, "recover")) {
+		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
-		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	} else if (cmd_match(page, "reshape")) {
 		int err;
 		if (mddev->pers->start_reshape == NULL)
 			return -EINVAL;
 		err = mddev_lock(mddev);
 		if (!err) {
+			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 			err = mddev->pers->start_reshape(mddev);
 			mddev_unlock(mddev);
 		}
@@ -4250,6 +4251,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
 			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 		else if (!cmd_match(page, "repair"))
 			return -EINVAL;
+		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
 		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 	}