From 6d56e278444bc8323b1bcfcada126b8e4dbba0f4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 14 Apr 2009 12:01:57 +1000 Subject: md: allow setting newly added device to 'in_sync' via sysfs. When adding devices to an active array via sysfs, there is currently no way to mark a device as 'in-sync' which is useful when incrementally assembling an array. So add that option. Signed-off-by: NeilBrown --- drivers/md/md.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index ed5727c089a..298731b8e95 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2086,6 +2086,7 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) * -writemostly - clears write_mostly * blocked - sets the Blocked flag * -blocked - clears the Blocked flag + * insync - sets Insync providing device isn't active */ int err = -EINVAL; if (cmd_match(buf, "faulty") && rdev->mddev->pers) { @@ -2117,6 +2118,9 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); + err = 0; + } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { + set_bit(In_sync, &rdev->flags); err = 0; } if (!err && rdev->sysfs_state) @@ -2190,7 +2194,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) } else if (rdev->mddev->pers) { mdk_rdev_t *rdev2; /* Activating a spare .. or possibly reactivating - * if we every get bitmaps working here. + * if we ever get bitmaps working here. */ if (rdev->raid_disk != -1) -- cgit v1.2.3 From acb180b0e335ad88acfed6c8a33e39c05b95dc49 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 14 Apr 2009 16:28:34 +1000 Subject: md: improve usefulness and accuracy of sysfs file md/sync_completed. The sync_completed file reports how much of a resync (or recovery or reshape) has been completed. However due to the possibility of out-of-order completion of writes, it is not certain to be accurate. We have an internal value - mddev->curr_resync_completed - which is an accurate value (though it might not always be quite so uptodate). So: - make curr_resync_completed be uptodate a little more often, particularly when raid5 reshape updates status in the metadata - report curr_resync_completed in the sysfs file - allow poll/select to report all updates to md/sync_completed. This makes sync_completed completed usable by any external metadata handler that wants to record this status information in its metadata. Signed-off-by: NeilBrown --- drivers/md/md.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 298731b8e95..7af64f3846a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2017,6 +2017,8 @@ repeat: clear_bit(MD_CHANGE_PENDING, &mddev->flags); spin_unlock_irq(&mddev->write_lock); wake_up(&mddev->sb_wait); + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } @@ -3486,12 +3488,15 @@ sync_completed_show(mddev_t *mddev, char *page) { unsigned long max_sectors, resync; + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return sprintf(page, "none\n"); + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) max_sectors = mddev->resync_max_sectors; else max_sectors = mddev->dev_sectors; - resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); + resync = mddev->curr_resync_completed; return sprintf(page, "%lu / %lu\n", resync, max_sectors); } @@ -6338,18 +6343,12 @@ void md_do_sync(mddev_t *mddev) sector_t sectors; skipped = 0; - if (j >= mddev->resync_max) { - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); - wait_event(mddev->recovery_wait, - mddev->resync_max > j - || kthread_should_stop()); - } - if (kthread_should_stop()) - goto interrupted; - if (mddev->curr_resync > mddev->curr_resync_completed && - (mddev->curr_resync - mddev->curr_resync_completed) - > (max_sectors >> 4)) { + if ((mddev->curr_resync > mddev->curr_resync_completed && + (mddev->curr_resync - mddev->curr_resync_completed) + > (max_sectors >> 4)) || + j >= mddev->resync_max + ) { /* time to update curr_resync_completed */ blk_unplug(mddev->queue); wait_event(mddev->recovery_wait, @@ -6357,7 +6356,17 @@ void md_do_sync(mddev_t *mddev) mddev->curr_resync_completed = mddev->curr_resync; set_bit(MD_CHANGE_CLEAN, &mddev->flags); + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } + + if (j >= mddev->resync_max) + wait_event(mddev->recovery_wait, + mddev->resync_max > j + || kthread_should_stop()); + + if (kthread_should_stop()) + goto interrupted; + sectors = mddev->pers->sync_request(mddev, j, &skipped, currspeed < speed_min(mddev)); if (sectors == 0) { @@ -6465,6 +6474,7 @@ void md_do_sync(mddev_t *mddev) skip: mddev->curr_resync = 0; + mddev->curr_resync_completed = 0; mddev->resync_min = 0; mddev->resync_max = MaxSector; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); -- cgit v1.2.3 From c03f6a19699fce4389888a45db8245969311d868 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 17 Apr 2009 11:06:30 +1000 Subject: md: update sync_completed and reshape_position even more often. There are circumstances when a user-space process might need to "oversee" a resync/reshape process. For example when doing an in-place reshape of a raid5, it is prudent to take a backup of each section before reshaping it as this is the only way to provide safety against an unplanned shutdown (i.e. crash/power failure). The sync_max sysfs value can be used to stop the resync from advancing beyond a particular point. So user-space can: suspend IO to the first section and back it up set 'sync_max' to the end of the section wait for 'sync_completed' to reach that point resume IO on the first section and move on to the next section. However this process requires the kernel and user-space to run in lock-step which could introduce unnecessary delays. It would be better if a 'double buffered' approach could be used with userspace and kernel space working on different sections with the 'next' section always ready when the 'current' section is finished. One problem with implementing this is that sync_completed is only guaranteed to be updated when the sync process reaches sync_max. (it is updated on a time basis at other times, but it is hard to rely on that). This defeats some of the double buffering. With this patch, sync_completed (and reshape_position) get updated as the current position approaches sync_max, so there is room for userspace to advance sync_max early without losing updates. To be precise, sync_completed is updated when the current sync position reaches half way between the current value of sync_completed and the value of sync_max. This will usually be a good time for user space to update sync_max. If sync_max does not get updated, the updates to sync_completed (together with associated metadata updates) will occur at an exponentially increasing frequency which will get unreasonably fast (one update every page) immediately before the process hits sync_max and stops. So the update rate will be unreasonably fast only for an insignificant period of time. Signed-off-by: NeilBrown --- drivers/md/md.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 7af64f3846a..612343fdde9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6347,7 +6347,8 @@ void md_do_sync(mddev_t *mddev) if ((mddev->curr_resync > mddev->curr_resync_completed && (mddev->curr_resync - mddev->curr_resync_completed) > (max_sectors >> 4)) || - j >= mddev->resync_max + (j - mddev->curr_resync_completed)*2 + >= mddev->resync_max - mddev->curr_resync_completed ) { /* time to update curr_resync_completed */ blk_unplug(mddev->queue); -- cgit v1.2.3 From dd71cf6b2773310b01c6fe6c773064c80fd2476b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 7 May 2009 12:49:35 +1000 Subject: md: tidy up status_resync to handle large arrays. Two problems in status_resync. 1/ It still used Kilobytes as the basic block unit, while most code now uses sectors uniformly. 2/ It doesn't allow for the possibility that max_sectors exceeds the range of "unsigned long". So - change "max_blocks" to "max_sectors", and store sector numbers in there and in 'resync' - Make 'rt' a 'sector_t' so it can temporarily hold the number of remaining sectors. - use sector_div rather than normal division. - change the magic '100' used to preserve precision to '32'. + making it a power of 2 makes division easier + it doesn't need to be as large as it was chosen when we averaged speed over the entire run. Now we average speed over the last 30 seconds or so. Reported-by: "Mario 'BitKoenig' Holbe" Signed-off-by: NeilBrown --- drivers/md/md.c | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 612343fdde9..5eb01a4d27b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5705,37 +5705,38 @@ static void status_unused(struct seq_file *seq) static void status_resync(struct seq_file *seq, mddev_t * mddev) { - sector_t max_blocks, resync, res; - unsigned long dt, db, rt; + sector_t max_sectors, resync, res; + unsigned long dt, db; + sector_t rt; int scale; unsigned int per_milli; - resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; + resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - max_blocks = mddev->resync_max_sectors >> 1; + max_sectors = mddev->resync_max_sectors; else - max_blocks = mddev->dev_sectors / 2; + max_sectors = mddev->dev_sectors; /* * Should not happen. */ - if (!max_blocks) { + if (!max_sectors) { MD_BUG(); return; } /* Pick 'scale' such that (resync>>scale)*1000 will fit - * in a sector_t, and (max_blocks>>scale) will fit in a + * in a sector_t, and (max_sectors>>scale) will fit in a * u32, as those are the requirements for sector_div. * Thus 'scale' must be at least 10 */ scale = 10; if (sizeof(sector_t) > sizeof(unsigned long)) { - while ( max_blocks/2 > (1ULL<<(scale+32))) + while ( max_sectors/2 > (1ULL<<(scale+32))) scale++; } res = (resync>>scale)*1000; - sector_div(res, (u32)((max_blocks>>scale)+1)); + sector_div(res, (u32)((max_sectors>>scale)+1)); per_milli = res; { @@ -5756,25 +5757,35 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery"))), per_milli/10, per_milli % 10, - (unsigned long long) resync, - (unsigned long long) max_blocks); + (unsigned long long) resync/2, + (unsigned long long) max_sectors/2); /* - * We do not want to overflow, so the order of operands and - * the * 100 / 100 trick are important. We do a +1 to be - * safe against division by zero. We only estimate anyway. - * * dt: time from mark until now * db: blocks written from mark until now * rt: remaining time + * + * rt is a sector_t, so could be 32bit or 64bit. + * So we divide before multiply in case it is 32bit and close + * to the limit. + * We scale the divisor (db) by 32 to avoid loosing precision + * near the end of resync when the number of remaining sectors + * is close to 'db'. + * We then divide rt by 32 after multiplying by db to compensate. + * The '+1' avoids division by zero if db is very small. */ dt = ((jiffies - mddev->resync_mark) / HZ); if (!dt) dt++; db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) - mddev->resync_mark_cnt; - rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100; - seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); + rt = max_sectors - resync; /* number of remaining sectors */ + sector_div(rt, db/32+1); + rt *= dt; + rt >>= 5; + + seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, + ((unsigned long)rt % 60)/6); seq_printf(seq, " speed=%ldK/sec", db/2/dt); } -- cgit v1.2.3 From 110518bccf076726cc93bf604527d8019aae50ba Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Thu, 7 May 2009 12:49:37 +1000 Subject: md: constify VFTs Signed-off-by: Jan Engelhardt Signed-off-by: NeilBrown --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 5eb01a4d27b..8350bde60d1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5976,7 +5976,7 @@ static int md_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations md_seq_ops = { +static const struct seq_operations md_seq_ops = { .start = md_seq_start, .next = md_seq_next, .stop = md_seq_stop, -- cgit v1.2.3 From 5bf295975416f8e97117bbbcfb0191c00bc3e2b4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 7 May 2009 12:50:57 +1000 Subject: md: remove ability to explicit set an inactive array to 'clean'. Being able to write 'clean' to an 'array_state' of an inactive array to activate it in 'clean' mode is both unnecessary and inconvenient. It is unnecessary because the same can be achieved by writing 'active'. This activates and array, but it still remains 'clean' until the first write. It is inconvenient because writing 'clean' is more often used to cause an 'active' array to revert to 'clean' mode (thus blocking any writes until a 'write-pending' is promoted to 'active'). Allowing 'clean' to both activate an array and mark an active array as clean can lead to races: One program writes 'clean' to mark the active array as clean at the same time as another program writes 'inactive' to deactivate (stop) and active array. Depending on which writes first, the array could be deactivated and immediately reactivated which isn't what was desired. So just disable the use of 'clean' to activate an array. This avoids a race that can be triggered with mdadm-3.0 and external metadata, so it suitable for -stable. Reported-by: Rafal Marszewski Acked-by: Dan Williams Cc: Signed-off-by: NeilBrown --- drivers/md/md.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 8350bde60d1..1dd723d3188 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3066,11 +3066,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) } else err = -EBUSY; spin_unlock_irq(&mddev->write_lock); - } else { - mddev->ro = 0; - mddev->recovery_cp = MaxSector; - err = do_md_run(mddev); - } + } else + err = -EINVAL; break; case active: if (mddev->pers) { -- cgit v1.2.3 From c4647292fda0833bebe45be27f04453b736981fa Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 7 May 2009 12:51:06 +1000 Subject: md: remove rd%d links immediately after stopping an array. md maintains link in sys/mdXX/md/ to identify which device has which role in the array. e.g. rd2 -> dev-sda indicates that the device with role '2' in the array is sda. These links are only present when the array is active. They are created immediately after ->run is called, and so should be removed immediately after ->stop is called. However they are currently removed a little bit later, and it is possible for ->run to be called again, thus adding these links, before they are removed. So move the removal earlier so they are consistently only present when the array is active. Signed-off-by: NeilBrown --- drivers/md/md.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 1dd723d3188..fccc8343a25 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4294,6 +4294,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) { int err = 0; struct gendisk *disk = mddev->gendisk; + mdk_rdev_t *rdev; if (atomic_read(&mddev->openers) > is_open) { printk("md: %s still in use.\n",mdname(mddev)); @@ -4336,6 +4337,13 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) /* tell userspace to handle 'inactive' */ sysfs_notify_dirent(mddev->sysfs_state); + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= 0) { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + } + set_capacity(disk, 0); mddev->changed = 1; @@ -4356,7 +4364,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) * Free resources if final stop */ if (mode == 0) { - mdk_rdev_t *rdev; printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); @@ -4368,13 +4375,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) } mddev->bitmap_offset = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= 0) { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); - } - /* make sure all md_delayed_delete calls have finished */ flush_scheduled_work(); -- cgit v1.2.3