From e0a115e5aa554b93150a8dc1c3fe15467708abb2 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 5 Jun 2008 22:45:52 -0700
Subject: md: fix prexor vs sync_request race

During the initial array synchronization process there is a window between
when a prexor operation is scheduled to a specific stripe and when it
completes for a sync_request to be scheduled to the same stripe.  When
this happens the prexor completes and the stripe is unconditionally marked
"insync", effectively canceling the sync_request for the stripe.  Prior to
2.6.23 this was not a problem because the prexor operation was done under
sh->lock.  The effect in older kernels being that the prexor would still
erroneously mark the stripe "insync", but sync_request would be held off
and re-mark the stripe as "!in_sync".

Change the write completion logic to not mark the stripe "in_sync" if a
prexor was performed.  The effect of the change is to sometimes not set
STRIPE_INSYNC.  The worst this can do is cause the resync to stall waiting
for STRIPE_INSYNC to be set.  If this were happening, then STRIPE_SYNCING
would be set and handle_issuing_new_read_requests would cause all
available blocks to eventually be read, at which point prexor would never
be used on that stripe any more and STRIPE_INSYNC would eventually be set.

echo repair > /sys/block/mdN/md/sync_action will correct arrays that may
have lost this race.

Cc: <stable@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/raid5.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 425958a76b8..f0f0585c107 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2645,6 +2645,7 @@ static void handle_stripe5(struct stripe_head *sh)
 	struct r5dev *dev;
 	unsigned long pending = 0;
 	mdk_rdev_t *blocked_rdev = NULL;
+	int prexor;
 
 	memset(&s, 0, sizeof(s));
 	pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d "
@@ -2774,9 +2775,11 @@ static void handle_stripe5(struct stripe_head *sh)
 	/* leave prexor set until postxor is done, allows us to distinguish
 	 * a rmw from a rcw during biodrain
 	 */
+	prexor = 0;
 	if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) &&
 		test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
 
+		prexor = 1;
 		clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
 		clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
 		clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
@@ -2810,6 +2813,8 @@ static void handle_stripe5(struct stripe_head *sh)
 				if (!test_and_set_bit(
 				    STRIPE_OP_IO, &sh->ops.pending))
 					sh->ops.count++;
+				if (prexor)
+					continue;
 				if (!test_bit(R5_Insync, &dev->flags) ||
 				    (i == sh->pd_idx && s.failed == 0))
 					set_bit(STRIPE_INSYNC, &sh->state);
-- 
cgit v1.2.3


From a6d8113a986c66aeb379a26b6e0062488b3e59e1 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 5 Jun 2008 22:45:53 -0700
Subject: md: fix uninitialized use of mddev->recovery_wait

If an array was created with --assume-clean we will oops when trying to
set ->resync_max.

Fix this by initializing ->recovery_wait in mddev_find.

Cc: <stable@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 51c19f86ff9..7cf512a34cc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -276,6 +276,7 @@ static mddev_t * mddev_find(dev_t unit)
 	atomic_set(&new->active, 1);
 	spin_lock_init(&new->write_lock);
 	init_waitqueue_head(&new->sb_wait);
+	init_waitqueue_head(&new->recovery_wait);
 	new->reshape_position = MaxSector;
 	new->resync_max = MaxSector;
 	new->level = LEVEL_NONE;
@@ -5665,7 +5666,6 @@ void md_do_sync(mddev_t *mddev)
 		window/2,(unsigned long long) max_sectors/2);
 
 	atomic_set(&mddev->recovery_active, 0);
-	init_waitqueue_head(&mddev->recovery_wait);
 	last_check = 0;
 
 	if (j>2) {
-- 
cgit v1.2.3


From c337869d95011495fa181536786e74aa2d7ff031 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 5 Jun 2008 22:45:54 -0700
Subject: md: do not compute parity unless it is on a failed drive

If a block is computed (rather than read) then a check/repair operation
may be lead to believe that the data on disk is correct, when infact it
isn't.  So only compute blocks for failed devices.

This issue has been around since at least 2.6.12, but has become harder to
hit in recent kernels since most reads bypass the cache.

echo repair > /sys/block/mdN/md/sync_action will set the parity blocks to the
correct state.

Cc: <stable@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/raid5.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f0f0585c107..c37e256b117 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2002,6 +2002,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
 		 * have quiesced.
 		 */
 		if ((s->uptodate == disks - 1) &&
+		    (s->failed && disk_idx == s->failed_num) &&
 		    !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
 			set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
 			set_bit(R5_Wantcompute, &dev->flags);
@@ -2087,7 +2088,9 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh,
 			/* we would like to get this block, possibly
 			 * by computing it, but we might not be able to
 			 */
-			if (s->uptodate == disks-1) {
+			if ((s->uptodate == disks - 1) &&
+			    (s->failed && (i == r6s->failed_num[0] ||
+					   i == r6s->failed_num[1]))) {
 				pr_debug("Computing stripe %llu block %d\n",
 				       (unsigned long long)sh->sector, i);
 				compute_block_1(sh, i, 0);
-- 
cgit v1.2.3


From 8c2e870a625bd336b2e7a65a97c1836acef07322 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@notabene.brown>
Date: Sat, 28 Jun 2008 08:30:52 +1000
Subject: Ensure interrupted recovery completed properly (v1 metadata plus
 bitmap)

If, while assembling an array, we find a device which is not fully
in-sync with the array, it is important to set the "fullsync" flags.
This is an exact analog to the setting of this flag in hot_add_disk
methods.

Currently, only v1.x metadata supports having devices in an array
which are not fully in-sync (it keep track of how in sync they are).
The 'fullsync' flag only makes a difference when a write-intent bitmap
is being used.  In this case it tells recovery to ignore the bitmap
and recovery all blocks.

This fix is already in place for raid1, but not raid5/6 or raid10.

So without this fix, a raid1 ir raid4/5/6 array with version 1.x
metadata and a write intent bitmaps, that is stopped in the middle
of a recovery, will appear to complete the recovery instantly
after it is reassembled, but the recovery will not be correct.

If you might have an array like that, issueing
   echo repair > /sys/block/mdXX/md/sync_action

will make sure recovery completes properly.

Cc: <stable@kernel.org>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/raid10.c | 2 ++
 drivers/md/raid5.c  | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1de17da34a9..a71277b640a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2137,6 +2137,8 @@ static int run(mddev_t *mddev)
 		    !test_bit(In_sync, &disk->rdev->flags)) {
 			disk->head_position = 0;
 			mddev->degraded++;
+			if (disk->rdev)
+				conf->fullsync = 1;
 		}
 	}
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c37e256b117..475fba4d371 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4305,7 +4305,9 @@ static int run(mddev_t *mddev)
 				" disk %d\n", bdevname(rdev->bdev,b),
 				raid_disk);
 			working_disks++;
-		}
+		} else
+			/* Cannot rely on bitmap to complete recovery */
+			conf->fullsync = 1;
 	}
 
 	/*
-- 
cgit v1.2.3


From efe311431869b40d67911820a309f9a1a41306f3 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@notabene.brown>
Date: Sat, 28 Jun 2008 08:31:14 +1000
Subject: Don't acknowlege that stripe-expand is complete until it really is.

We shouldn't acknowledge that a stripe has been expanded (When
reshaping a raid5 by adding a device) until the moved data has
actually been written out.  However we are currently
acknowledging (by calling md_done_sync) when the POST_XOR
is complete and before the write.

So track in s.locked whether there are pending writes, and don't
call md_done_sync yet if there are.

Note: we all set R5_LOCKED on devices which are are about to
read from.  This probably isn't technically necessary, but is
usually done when writing a block, and justifies the use of
s.locked here.

This bug can lead to a crash if an array is stopped while an reshape
is in progress.

Cc: <stable@kernel.org>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/raid5.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 475fba4d371..54c8ee28fcc 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2898,6 +2898,8 @@ static void handle_stripe5(struct stripe_head *sh)
 
 		for (i = conf->raid_disks; i--; ) {
 			set_bit(R5_Wantwrite, &sh->dev[i].flags);
+			set_bit(R5_LOCKED, &dev->flags);
+			s.locked++;
 			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
 				sh->ops.count++;
 		}
@@ -2911,6 +2913,7 @@ static void handle_stripe5(struct stripe_head *sh)
 			conf->raid_disks);
 		s.locked += handle_write_operations5(sh, 1, 1);
 	} else if (s.expanded &&
+		   s.locked == 0 &&
 		!test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
 		atomic_dec(&conf->reshape_stripes);
-- 
cgit v1.2.3


From 9bbbca3a0ee09293108b67835c6bdf6196d7bcb3 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@notabene.brown>
Date: Sat, 28 Jun 2008 08:31:17 +1000
Subject: Fix error paths if md_probe fails.

md_probe can fail (e.g. alloc_disk could fail) without
returning an error (as it alway returns NULL).
So when we call mddev_find immediately afterwards, we need
to check that md_probe actually succeeded.  This means checking
that mdev->gendisk is non-NULL.

cc: <stable@kernel.org>
Cc: Dave Jones <davej@redhat.com>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7cf512a34cc..2580ac1b9b0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3897,8 +3897,10 @@ static void autorun_devices(int part)
 
 		md_probe(dev, NULL, NULL);
 		mddev = mddev_find(dev);
-		if (!mddev) {
-			printk(KERN_ERR 
+		if (!mddev || !mddev->gendisk) {
+			if (mddev)
+				mddev_put(mddev);
+			printk(KERN_ERR
 				"md: cannot allocate memory for md drive.\n");
 			break;
 		}
-- 
cgit v1.2.3


From c7f1b2044191a82e7f0a1a674751ed582289e2e0 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Wed, 2 Jul 2008 09:34:28 +0100
Subject: dm crypt: use cond_resched

Add cond_resched() to prevent monopolising CPU when processing large bios.

dm-crypt processes encryption of bios in sector units.  If the bio request
is big it can spend a long time in the encryption call.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Tested-by: Yan Li <elliot.li.tech@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 835def11419..ab6a61db63c 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -432,6 +432,7 @@ static int crypt_convert(struct crypt_config *cc,
 		case 0:
 			atomic_dec(&ctx->pending);
 			ctx->sector++;
+			cond_resched();
 			continue;
 
 		/* error */
-- 
cgit v1.2.3