From 5e8651060cea6b44844521ddcac665e2c021f5d8 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 12 Nov 2009 12:08:04 +1100
Subject: md: factor out updating of 'recovery_offset'.

Each device has its own 'recovery_offset' showing how far
recovery has progressed on the device.
As the only real significance of this is that fact that it can
be stored in the metadata and recovered at restart, and as
only 1.x metadata can do this, we were only updating
'recovery_offset' to 'curr_resync_completed' when updating
v1.x metadata.
But this is wrong, and we will shortly make limited use of this
field in v0.90 metadata.

So move the update into common code.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index e64c971038d..01b9a0fd16e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1382,8 +1382,6 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	if (rdev->raid_disk >= 0 &&
 	    !test_bit(In_sync, &rdev->flags)) {
-		if (mddev->curr_resync_completed > rdev->recovery_offset)
-			rdev->recovery_offset = mddev->curr_resync_completed;
 		if (rdev->recovery_offset > 0) {
 			sb->feature_map |=
 				cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
@@ -1917,6 +1915,14 @@ static void sync_sbs(mddev_t * mddev, int nospares)
 	 */
 	mdk_rdev_t *rdev;
 
+	/* First make sure individual recovery_offsets are correct */
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		if (rdev->raid_disk >= 0 &&
+		    !test_bit(In_sync, &rdev->flags) &&
+		    mddev->curr_resync_completed > rdev->recovery_offset)
+				rdev->recovery_offset = mddev->curr_resync_completed;
+
+	}	
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		if (rdev->sb_events == mddev->events ||
 		    (nospares &&
-- 
cgit v1.2.3


From 0261cd9f1cb42fa44ece314d27868d83742bdf03 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 13 Nov 2009 17:40:48 +1100
Subject: md: allow v0.91 metadata to record devices as being active but not
 in-sync.

This is a combination that didn't really make sense before.
However when a reshape is converting e.g. raid5 -> raid6, the extra
device is not fully in-sync, but is certainly active and contains
important data.
So allow that start to be meaningful and in particular get
the 'recovery_offset' value (which is needed for any non-in-sync
active device) from the reshape_position.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 01b9a0fd16e..b182f86a19d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -944,6 +944,14 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 			    desc->raid_disk < mddev->raid_disks */) {
 			set_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = desc->raid_disk;
+		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
+			/* active but not in sync implies recovery up to
+			 * reshape position.  We don't know exactly where
+			 * that is, so set to zero for now */
+			if (mddev->minor_version >= 91) {
+				rdev->recovery_offset = 0;
+				rdev->raid_disk = desc->raid_disk;
+			}
 		}
 		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
 			set_bit(WriteMostly, &rdev->flags);
@@ -1032,8 +1040,19 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	list_for_each_entry(rdev2, &mddev->disks, same_set) {
 		mdp_disk_t *d;
 		int desc_nr;
-		if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
-		    && !test_bit(Faulty, &rdev2->flags))
+		int is_active = test_bit(In_sync, &rdev2->flags);
+
+		if (rdev2->raid_disk >= 0 &&
+		    sb->minor_version >= 91)
+			/* we have nowhere to store the recovery_offset,
+			 * but if it is not below the reshape_position,
+			 * we can piggy-back on that.
+			 */
+			is_active = 1;
+		if (rdev2->raid_disk < 0 ||
+		    test_bit(Faulty, &rdev2->flags))
+			is_active = 0;
+		if (is_active)
 			desc_nr = rdev2->raid_disk;
 		else
 			desc_nr = next_spare++;
@@ -1043,16 +1062,16 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 		d->number = rdev2->desc_nr;
 		d->major = MAJOR(rdev2->bdev->bd_dev);
 		d->minor = MINOR(rdev2->bdev->bd_dev);
-		if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
-		    && !test_bit(Faulty, &rdev2->flags))
+		if (is_active)
 			d->raid_disk = rdev2->raid_disk;
 		else
 			d->raid_disk = rdev2->desc_nr; /* compatibility */
 		if (test_bit(Faulty, &rdev2->flags))
 			d->state = (1<<MD_DISK_FAULTY);
-		else if (test_bit(In_sync, &rdev2->flags)) {
+		else if (is_active) {
 			d->state = (1<<MD_DISK_ACTIVE);
-			d->state |= (1<<MD_DISK_SYNC);
+			if (test_bit(In_sync, &rdev2->flags))
+				d->state |= (1<<MD_DISK_SYNC);
 			active++;
 			working++;
 		} else {
-- 
cgit v1.2.3


From 7ef90146a14c2bb1de2e22399f147ebec5b74f0b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 13 Nov 2009 17:40:51 +1100
Subject: Don't unconditionally set in_sync on newly added device in
 raid5_reshape

When a reshape finds that it can add spare devices into the array,
those devices might already be 'in_sync' if they are beyond the old
size of the array, or they might not if they are within the array.

The first case happens when we change an N-drive RAID5 to an
N+1-drive RAID5.
The second happens when we convert an N-drive RAID5 to an
N+1-drive RAID6.

So set the flag more carefully.
Also, ->recovery_offset is only meaningful when the flag is clear,
so only set it in that case.

This change needs the preceding two to ensure that the non-in_sync
device doesn't get evicted from the array when it is stopped, in the
case where v0.90 metadata is used.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index dcce204b6c7..ab40529bdab 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5361,9 +5361,11 @@ static int raid5_start_reshape(mddev_t *mddev)
 		    !test_bit(Faulty, &rdev->flags)) {
 			if (raid5_add_disk(mddev, rdev) == 0) {
 				char nm[20];
-				set_bit(In_sync, &rdev->flags);
+				if (rdev->raid_disk >= conf->previous_raid_disks)
+					set_bit(In_sync, &rdev->flags);
+				else
+					rdev->recovery_offset = 0;
 				added_devices++;
-				rdev->recovery_offset = 0;
 				sprintf(nm, "rd%d", rdev->raid_disk);
 				if (sysfs_create_link(&mddev->kobj,
 						      &rdev->kobj, nm))
-- 
cgit v1.2.3


From c148ffdcda00b6599b70f8b65e6a1fadd1dbb127 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 13 Nov 2009 17:47:00 +1100
Subject: md/raid5: Allow dirty-degraded arrays to be assembled when only party
 is degraded.

Normally is it not safe to allow a raid5 that is both dirty and
degraded to be assembled without explicit request from that admin, as
it can cause hidden data corruption.
This is because 'dirty' means that the parity cannot be trusted, and
'degraded' means that the parity needs to be used.

However, if the device that is missing contains only parity, then
there is no issue and assembly can continue.
This particularly applies when a RAID5 is being converted to a RAID6
and there is an unclean shutdown while the conversion is happening.

So check for whether the degraded space only contains parity, and
in that case, allow the assembly.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 75 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ab40529bdab..d29215d966d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4823,11 +4823,40 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 		return ERR_PTR(-ENOMEM);
 }
 
+
+static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
+{
+	switch (algo) {
+	case ALGORITHM_PARITY_0:
+		if (raid_disk < max_degraded)
+			return 1;
+		break;
+	case ALGORITHM_PARITY_N:
+		if (raid_disk >= raid_disks - max_degraded)
+			return 1;
+		break;
+	case ALGORITHM_PARITY_0_6:
+		if (raid_disk == 0 || 
+		    raid_disk == raid_disks - 1)
+			return 1;
+		break;
+	case ALGORITHM_LEFT_ASYMMETRIC_6:
+	case ALGORITHM_RIGHT_ASYMMETRIC_6:
+	case ALGORITHM_LEFT_SYMMETRIC_6:
+	case ALGORITHM_RIGHT_SYMMETRIC_6:
+		if (raid_disk == raid_disks - 1)
+			return 1;
+	}
+	return 0;
+}
+
 static int run(mddev_t *mddev)
 {
 	raid5_conf_t *conf;
 	int working_disks = 0, chunk_size;
+	int dirty_parity_disks = 0;
 	mdk_rdev_t *rdev;
+	sector_t reshape_offset = 0;
 
 	if (mddev->recovery_cp != MaxSector)
 		printk(KERN_NOTICE "raid5: %s is not clean"
@@ -4861,6 +4890,7 @@ static int run(mddev_t *mddev)
 			       "on a stripe boundary\n");
 			return -EINVAL;
 		}
+		reshape_offset = here_new * mddev->new_chunk_sectors;
 		/* here_new is the stripe we will write to */
 		here_old = mddev->reshape_position;
 		sector_div(here_old, mddev->chunk_sectors *
@@ -4916,10 +4946,51 @@ static int run(mddev_t *mddev)
 	/*
 	 * 0 for a fully functional array, 1 or 2 for a degraded array.
 	 */
-	list_for_each_entry(rdev, &mddev->disks, same_set)
-		if (rdev->raid_disk >= 0 &&
-		    test_bit(In_sync, &rdev->flags))
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		if (rdev->raid_disk < 0)
+			continue;
+		if (test_bit(In_sync, &rdev->flags))
 			working_disks++;
+		/* This disc is not fully in-sync.  However if it
+		 * just stored parity (beyond the recovery_offset),
+		 * when we don't need to be concerned about the
+		 * array being dirty.
+		 * When reshape goes 'backwards', we never have
+		 * partially completed devices, so we only need
+		 * to worry about reshape going forwards.
+		 */
+		/* Hack because v0.91 doesn't store recovery_offset properly. */
+		if (mddev->major_version == 0 &&
+		    mddev->minor_version > 90)
+			rdev->recovery_offset = reshape_offset;
+			
+		printk("%d: w=%d pa=%d pr=%d m=%d a=%d r=%d op1=%d op2=%d\n",
+		       rdev->raid_disk, working_disks, conf->prev_algo,
+		       conf->previous_raid_disks, conf->max_degraded,
+		       conf->algorithm, conf->raid_disks, 
+		       only_parity(rdev->raid_disk,
+				   conf->prev_algo,
+				   conf->previous_raid_disks,
+				   conf->max_degraded),
+		       only_parity(rdev->raid_disk,
+				   conf->algorithm,
+				   conf->raid_disks,
+				   conf->max_degraded));
+		if (rdev->recovery_offset < reshape_offset) {
+			/* We need to check old and new layout */
+			if (!only_parity(rdev->raid_disk,
+					 conf->algorithm,
+					 conf->raid_disks,
+					 conf->max_degraded))
+				continue;
+		}
+		if (!only_parity(rdev->raid_disk,
+				 conf->prev_algo,
+				 conf->previous_raid_disks,
+				 conf->max_degraded))
+			continue;
+		dirty_parity_disks++;
+	}
 
 	mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)
 			   - working_disks);
@@ -4935,7 +5006,7 @@ static int run(mddev_t *mddev)
 	mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
 	mddev->resync_max_sectors = mddev->dev_sectors;
 
-	if (mddev->degraded > 0 &&
+	if (mddev->degraded > dirty_parity_disks &&
 	    mddev->recovery_cp != MaxSector) {
 		if (mddev->ok_start_degraded)
 			printk(KERN_WARNING
-- 
cgit v1.2.3