From 19052c0e85a3e9d3b7d190b29fcdbf0e6c105381 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 5 Aug 2008 15:54:13 +1000
Subject: Make writes to md/safe_mode_delay immediately effective.

If we reduce the 'safe_mode_delay', it could still wait for the old
delay to completely expire before doing anything about safe_mode.
Thus the effect if the change is delayed.

To make the effect more immediate, run the timeout function
immediately if the delay was reduced.  This may cause it to run
slightly earlier that required, but that is the safer option.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index c7aae66c6f9..48afe4f7ad4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2393,6 +2393,8 @@ static void analyze_sbs(mddev_t * mddev)
 
 }
 
+static void md_safemode_timeout(unsigned long data);
+
 static ssize_t
 safe_delay_show(mddev_t *mddev, char *page)
 {
@@ -2432,9 +2434,12 @@ safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
 	if (msec == 0)
 		mddev->safemode_delay = 0;
 	else {
+		unsigned long old_delay = mddev->safemode_delay;
 		mddev->safemode_delay = (msec*HZ)/1000;
 		if (mddev->safemode_delay == 0)
 			mddev->safemode_delay = 1;
+		if (mddev->safemode_delay < old_delay)
+			md_safemode_timeout((unsigned long)mddev);
 	}
 	return len;
 }
-- 
cgit v1.2.3


From 2b25000bf5157c28d8591f03f0575248a8cbd900 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 5 Aug 2008 15:54:13 +1000
Subject: Restore force switch of md array to readonly at reboot time.

A recent patch allowed do_md_stop to know whether it was being called
via an ioctl or not, and thus where to allow for an extra open file
descriptor when checking if it is in use.
This broke then switch to readonly performed by the shutdown notifier,
which needs to work even when the array is still (apparently) active
(as md doesn't get told when the filesystem becomes readonly).

So restore this feature by pretending that there can be lots of
file descriptors open, but we still want do_md_stop to switch to
readonly.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 48afe4f7ad4..8d11cd1a0d8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6237,7 +6237,11 @@ static int md_notify_reboot(struct notifier_block *this,
 
 		for_each_mddev(mddev, tmp)
 			if (mddev_trylock(mddev)) {
-				do_md_stop (mddev, 1, 0);
+				/* Force a switch to readonly even array
+				 * appears to still be in use.  Hence
+				 * the '100'.
+				 */
+				do_md_stop (mddev, 1, 100);
 				mddev_unlock(mddev);
 			}
 		/*
-- 
cgit v1.2.3


From dba034eef2456d2a9f9a76806846c97acf6c3ad1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 5 Aug 2008 15:54:13 +1000
Subject: Fail safely when trying to grow an array with a write-intent bitmap.

We cannot currently change the size of a write-intent bitmap.
So if we change the size of an array which has such a bitmap, it
tries to set bits beyond the end of the bitmap.

For now, simply reject any request to change the size of an array
which has a bitmap.  mdadm can remove the bitmap and add a new one
after the array has changed size.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c    | 5 +++++
 drivers/md/raid5.c | 3 +++
 2 files changed, 8 insertions(+)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8d11cd1a0d8..6eb95451f16 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4639,6 +4639,11 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
 	 */
 	if (mddev->sync_thread)
 		return -EBUSY;
+	if (mddev->bitmap)
+		/* Sorry, cannot grow a bitmap yet, just remove it,
+		 * grow, and re-add.
+		 */
+		return -EBUSY;
 	rdev_for_each(rdev, tmp, mddev) {
 		sector_t avail;
 		avail = rdev->size * 2;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 40e93967565..17e0953c3c0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4446,6 +4446,9 @@ static int raid5_check_reshape(mddev_t *mddev)
 		return -EINVAL; /* Cannot shrink array or change level yet */
 	if (mddev->delta_disks == 0)
 		return 0; /* nothing to do */
+	if (mddev->bitmap)
+		/* Cannot grow a bitmap yet */
+		return -EBUSY;
 
 	/* Can only proceed if there are plenty of stripe_heads.
 	 * We need a minimum of one full stripe,, and for sensible progress
-- 
cgit v1.2.3


From ac4090d24c6a26211bc4523d920376e054d4f3f8 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 5 Aug 2008 15:54:13 +1000
Subject: Don't let a blocked_rdev interfere with read request in raid5/6

When we have externally managed metadata, we need to mark a failed
device as 'Blocked' and not allow any writes until that device
have been marked as faulty in the metadata and the Blocked flag has
been removed.

However it is perfectly OK to allow read requests when there is a
Blocked device, and with a readonly array, there may not be any
metadata-handler watching for blocked devices.

So in raid5/raid6 only allow a Blocked device to interfere with
Write request or resync.  Read requests go through untouched.

raid1 and raid10 already differentiate between read and write
properly.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 17e0953c3c0..224de022e7c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2568,10 +2568,10 @@ static bool handle_stripe5(struct stripe_head *sh)
 		if (dev->written)
 			s.written++;
 		rdev = rcu_dereference(conf->disks[i].rdev);
-		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+		if (blocked_rdev == NULL &&
+		    rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
 			blocked_rdev = rdev;
 			atomic_inc(&rdev->nr_pending);
-			break;
 		}
 		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
 			/* The ReadError flag will just be confusing now */
@@ -2588,8 +2588,14 @@ static bool handle_stripe5(struct stripe_head *sh)
 	rcu_read_unlock();
 
 	if (unlikely(blocked_rdev)) {
-		set_bit(STRIPE_HANDLE, &sh->state);
-		goto unlock;
+		if (s.syncing || s.expanding || s.expanded ||
+		    s.to_write || s.written) {
+			set_bit(STRIPE_HANDLE, &sh->state);
+			goto unlock;
+		}
+		/* There is nothing for the blocked_rdev to block */
+		rdev_dec_pending(blocked_rdev, conf->mddev);
+		blocked_rdev = NULL;
 	}
 
 	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
@@ -2832,10 +2838,10 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 		if (dev->written)
 			s.written++;
 		rdev = rcu_dereference(conf->disks[i].rdev);
-		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+		if (blocked_rdev == NULL &&
+		    rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
 			blocked_rdev = rdev;
 			atomic_inc(&rdev->nr_pending);
-			break;
 		}
 		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
 			/* The ReadError flag will just be confusing now */
@@ -2853,9 +2859,16 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	rcu_read_unlock();
 
 	if (unlikely(blocked_rdev)) {
-		set_bit(STRIPE_HANDLE, &sh->state);
-		goto unlock;
+		if (s.syncing || s.expanding || s.expanded ||
+		    s.to_write || s.written) {
+			set_bit(STRIPE_HANDLE, &sh->state);
+			goto unlock;
+		}
+		/* There is nothing for the blocked_rdev to block */
+		rdev_dec_pending(blocked_rdev, conf->mddev);
+		blocked_rdev = NULL;
 	}
+
 	pr_debug("locked=%d uptodate=%d to_read=%d"
 	       " to_write=%d failed=%d failed_num=%d,%d\n",
 	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
-- 
cgit v1.2.3


From c89a8eee61540df04fc83f32f51ef0f46ec018b1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 5 Aug 2008 15:54:13 +1000
Subject: Allow faulty devices to be removed from a readonly array.

Removing faulty devices from an array is a two stage process.
First the device is moved from being a part of the active array
to being similar to a spare device.  Then it can be removed
by a request from user space.

The first step is currently not performed for read-only arrays,
so the second step can never succeed.

So allow readonly arrays to remove failed devices (which aren't
blocked).

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 6eb95451f16..25b893ec562 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6003,7 +6003,7 @@ static int remove_and_add_spares(mddev_t *mddev)
 			}
 		}
 
-	if (mddev->degraded) {
+	if (mddev->degraded && ! mddev->ro) {
 		rdev_for_each(rdev, rtmp, mddev) {
 			if (rdev->raid_disk >= 0 &&
 			    !test_bit(In_sync, &rdev->flags) &&
@@ -6077,6 +6077,8 @@ void md_check_recovery(mddev_t *mddev)
 		flush_signals(current);
 	}
 
+	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
+		return;
 	if ( ! (
 		(mddev->flags && !mddev->external) ||
 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
@@ -6090,6 +6092,15 @@ void md_check_recovery(mddev_t *mddev)
 	if (mddev_trylock(mddev)) {
 		int spares = 0;
 
+		if (mddev->ro) {
+			/* Only thing we do on a ro array is remove
+			 * failed devices.
+			 */
+			remove_and_add_spares(mddev);
+			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			goto unlock;
+		}
+
 		if (!mddev->external) {
 			int did_change = 0;
 			spin_lock_irq(&mddev->write_lock);
-- 
cgit v1.2.3


From 0310fa216decc3ecfab41f327638fa48a81f3735 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 5 Aug 2008 15:54:14 +1000
Subject: Allow raid10 resync to happening in larger chunks.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The raid10 resync/recovery code currently limits the amount of
in-flight resync IO to 2Meg.  This was copied from raid1 where
it seems quite adequate.  However for raid10, some layouts require
a bit of seeking to perform a resync, and allowing a larger buffer
size means that the seeking can be significantly reduced.

There is probably no real need to limit the amount of in-flight
IO at all.  Any shortage of memory will naturally reduce the
amount of buffer space available down to a set minimum, and any
concurrent normal IO will quickly cause resync IO to back off.

The only problem would be that normal IO has to wait for all resync IO
to finish, so a very large amount of resync IO could cause unpleasant
latency when normal IO starts up.

So: increase RESYNC_DEPTH to allow 32Meg of buffer (if memory is
available) which seems to be a good amount.  Also reduce the amount
of memory reserved as there is no need to keep 2Meg just for resync if
memory is tight.

Thanks to Keld for the suggestion.

Cc: Keld Jørn Simonsen <keld@dkuug.dk>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d41bebb6da0..e34cd0e6247 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -76,11 +76,13 @@ static void r10bio_pool_free(void *r10_bio, void *data)
 	kfree(r10_bio);
 }
 
+/* Maximum size of each resync request */
 #define RESYNC_BLOCK_SIZE (64*1024)
-//#define RESYNC_BLOCK_SIZE PAGE_SIZE
-#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
-#define RESYNC_WINDOW (2048*1024)
+/* amount of memory to reserve for resync requests */
+#define RESYNC_WINDOW (1024*1024)
+/* maximum number of concurrent requests, memory permitting */
+#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
 
 /*
  * When performing a resync, we need to read and compare, so
@@ -690,7 +692,6 @@ static int flush_pending_writes(conf_t *conf)
  *    there is no normal IO happeing.  It must arrange to call
  *    lower_barrier when the particular background IO completes.
  */
-#define RESYNC_DEPTH 32
 
 static void raise_barrier(conf_t *conf, int force)
 {
-- 
cgit v1.2.3


From 56ac36d722d0d27c03599d1245ac0ab59e474e5c Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 7 Aug 2008 10:02:47 -0700
Subject: md: cancel check/repair requests when recovery is needed

If a 'repair' is requested when an array is in a position to 'recover' raid1
will perform the repair while md believes a recovery is happening.  Address
this at both ends, i.e. cancel check/repair requests upon detecting a
recover condition and do not call ->spare_active after completing a
check/repair.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/md/md.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 25b893ec562..8cfadc5bd2b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6138,7 +6138,8 @@ void md_check_recovery(mddev_t *mddev)
 			/* resync has finished, collect result */
 			md_unregister_thread(mddev->sync_thread);
 			mddev->sync_thread = NULL;
-			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+			    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 				/* success...*/
 				/* activate any spares */
 				if (mddev->pers->spare_active(mddev))
@@ -6190,6 +6191,7 @@ void md_check_recovery(mddev_t *mddev)
 		} else if ((spares = remove_and_add_spares(mddev))) {
 			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+			clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
 			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		} else if (mddev->recovery_cp < MaxSector) {
 			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
-- 
cgit v1.2.3