From 75ad23bc0fcb4f992a5d06982bf0857ab1738e9e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 29 Apr 2008 14:48:33 +0200 Subject: block: make queue flags non-atomic We can save some atomic ops in the IO path, if we clearly define the rules of how to modify the queue flags. Signed-off-by: Jens Axboe --- drivers/md/md.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 87620b705be..acd716b657b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -282,7 +282,8 @@ static mddev_t * mddev_find(dev_t unit) kfree(new); return NULL; } - set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags); + /* Can be unlocked because the queue is new: no concurrency */ + queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, new->queue); blk_queue_make_request(new->queue, md_fail_request); -- cgit v1.2.3 From c7705f3449c7edd5c1744871097f93977227afc4 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Tue, 29 Apr 2008 01:02:35 -0700 Subject: drivers: use non-racy method for proc entries creation (2) Use proc_create()/proc_create_data() to make sure that ->proc_fops and ->data be setup before gluing PDE to main tree. Signed-off-by: Denis V. Lunev Cc: Greg Kroah-Hartman Cc: Alexey Dobriyan Cc: "Eric W. Biederman" Cc: Peter Osterlund Cc: Bartlomiej Zolnierkiewicz Cc: Dmitry Torokhov Cc: Neil Brown Cc: Mauro Carvalho Chehab Cc: Bjorn Helgaas Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 87620b705be..6fe4a769c85 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5947,13 +5947,9 @@ static struct notifier_block md_notifier = { static void md_geninit(void) { - struct proc_dir_entry *p; - dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); - p = create_proc_entry("mdstat", S_IRUGO, NULL); - if (p) - p->proc_fops = &md_seq_fops; + proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); } static int __init md_init(void) -- cgit v1.2.3 From 6a51830e14529063cb2685921e1177d9af50e49a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 30 Apr 2008 00:52:28 -0700 Subject: md: fix use after free when removing rdev via sysfs rdev->mddev is no longer valid upon return from entry->store() when the 'remove' command is given. Cc: Signed-off-by: Dan Williams Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index bb3e4b1cb77..11457a28a14 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2097,7 +2097,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, rv = -EBUSY; else rv = entry->store(rdev, page, length); - mddev_unlock(rdev->mddev); + mddev_unlock(mddev); } return rv; } -- cgit v1.2.3 From 8377bc808029251c2c0f52116cf87d80291b25bf Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 30 Apr 2008 00:52:28 -0700 Subject: md: skip all metadata update processing when using external metadata. All the metadata update processing for external metadata is on in user-space or through the sysfs interfaces, so make "md_update_sb" a no-op in that case. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 11457a28a14..61767f1962e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1652,6 +1652,8 @@ static void md_update_sb(mddev_t * mddev, int force_change) int sync_req; int nospares = 0; + if (mddev->external) + return; repeat: spin_lock_irq(&mddev->write_lock); -- cgit v1.2.3 From d897dbf91490f26dccef3d7056ffd09eb83a15a5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 30 Apr 2008 00:52:29 -0700 Subject: md: reinitialise more mddev fields in do_md_stop. I keep finding problems where an mddev gets reused and some fields has a value from a previous usage that confuses the new usage. So clear all fields that could possible need clearing when calling do_md_stop. Also initialise the 'level' of a new array to LEVEL_NONE (which isn't 0). Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 61767f1962e..7a44c81ae3c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -276,6 +276,7 @@ static mddev_t * mddev_find(dev_t unit) init_waitqueue_head(&new->sb_wait); new->reshape_position = MaxSector; new->resync_max = MaxSector; + new->level = LEVEL_NONE; new->queue = blk_alloc_queue(GFP_KERNEL); if (!new->queue) { @@ -3714,6 +3715,30 @@ static int do_md_stop(mddev_t * mddev, int mode) mddev->reshape_position = MaxSector; mddev->external = 0; mddev->persistent = 0; + mddev->level = LEVEL_NONE; + mddev->clevel[0] = 0; + mddev->flags = 0; + mddev->ro = 0; + mddev->metadata_type[0] = 0; + mddev->chunk_size = 0; + mddev->ctime = mddev->utime = 0; + mddev->layout = 0; + mddev->max_disks = 0; + mddev->events = 0; + mddev->delta_disks = 0; + mddev->new_level = LEVEL_NONE; + mddev->new_layout = 0; + mddev->new_chunk = 0; + mddev->curr_resync = 0; + mddev->resync_mismatches = 0; + mddev->suspend_lo = mddev->suspend_hi = 0; + mddev->sync_speed_min = mddev->sync_speed_max = 0; + mddev->recovery = 0; + mddev->in_sync = 0; + mddev->changed = 0; + mddev->degraded = 0; + mddev->barriers_work = 0; + mddev->safemode = 0; } else if (mddev->pers) printk(KERN_INFO "md: %s switched to read-only mode.\n", -- cgit v1.2.3 From 31a59e3425d32743738e043c1df1668e0f22bbab Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 30 Apr 2008 00:52:30 -0700 Subject: md: fix 'safemode' handling for external metadata. 'safemode' relates to marking an array as 'clean' if there has been no write traffic for a while (a couple of seconds), to reduce the chance of the array being found dirty on reboot. ->safemode is set to '1' when there have been no write for a while, and it gets set to '0' when the superblock is updates with the 'clean' flag set. This requires a few fixes for 'external' metadata: - When an array is set to 'clean' via sysfs, 'safemode' must be cleared. - when we write to an array that has 'safemode' set (there must have been some delay in updating the metadata), we need to clear safemode. - Don't try to update external metadata in md_check_recovery for safemode transitions - it won't work. Also, don't try to support "immediate safe mode" (safemode==2) for external metadata, it cannot really work (the safemode timeout can be set very low if this is really needed). Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 7a44c81ae3c..ff9fef38960 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2615,6 +2615,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) if (atomic_read(&mddev->writes_pending) == 0) { if (mddev->in_sync == 0) { mddev->in_sync = 1; + if (mddev->safemode == 1) + mddev->safemode = 0; if (mddev->persistent) set_bit(MD_CHANGE_CLEAN, &mddev->flags); @@ -5392,6 +5394,8 @@ void md_write_start(mddev_t *mddev, struct bio *bi) md_wakeup_thread(mddev->sync_thread); } atomic_inc(&mddev->writes_pending); + if (mddev->safemode == 1) + mddev->safemode = 0; if (mddev->in_sync) { spin_lock_irq(&mddev->write_lock); if (mddev->in_sync) { @@ -5816,7 +5820,7 @@ void md_check_recovery(mddev_t *mddev) return; if (signal_pending(current)) { - if (mddev->pers->sync_request) { + if (mddev->pers->sync_request && !mddev->external) { printk(KERN_INFO "md: %s in immediate safe mode\n", mdname(mddev)); mddev->safemode = 2; @@ -5828,7 +5832,7 @@ void md_check_recovery(mddev_t *mddev) (mddev->flags && !mddev->external) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || - (mddev->safemode == 1) || + (mddev->external == 0 && mddev->safemode == 1) || (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) && !mddev->in_sync && mddev->recovery_cp == MaxSector) )) @@ -5837,16 +5841,20 @@ void md_check_recovery(mddev_t *mddev) if (mddev_trylock(mddev)) { int spares = 0; - spin_lock_irq(&mddev->write_lock); - if (mddev->safemode && !atomic_read(&mddev->writes_pending) && - !mddev->in_sync && mddev->recovery_cp == MaxSector) { - mddev->in_sync = 1; - if (mddev->persistent) - set_bit(MD_CHANGE_CLEAN, &mddev->flags); + if (!mddev->external) { + spin_lock_irq(&mddev->write_lock); + if (mddev->safemode && + !atomic_read(&mddev->writes_pending) && + !mddev->in_sync && + mddev->recovery_cp == MaxSector) { + mddev->in_sync = 1; + if (mddev->persistent) + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + } + if (mddev->safemode == 1) + mddev->safemode = 0; + spin_unlock_irq(&mddev->write_lock); } - if (mddev->safemode == 1) - mddev->safemode = 0; - spin_unlock_irq(&mddev->write_lock); if (mddev->flags) md_update_sb(mddev, 0); -- cgit v1.2.3 From 648b629ed406233b0a607a3cf29d8a169876131f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 30 Apr 2008 00:52:30 -0700 Subject: md: fix up switching md arrays between read-only and read-write When setting an array to 'readonly' or to 'active' via sysfs, we must make the appropriate set_disk_ro call too. Also when switching to "read_auto" (which is like readonly, but blocks on the first write so that metadata can be marked 'dirty') we need to be more careful about what state we are changing from. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index ff9fef38960..32b2ad88a2f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2594,15 +2594,20 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) err = do_md_stop(mddev, 1); else { mddev->ro = 1; + set_disk_ro(mddev->gendisk, 1); err = do_md_run(mddev); } break; case read_auto: - /* stopping an active array */ if (mddev->pers) { - err = do_md_stop(mddev, 1); - if (err == 0) - mddev->ro = 2; /* FIXME mark devices writable */ + if (mddev->ro != 1) + err = do_md_stop(mddev, 1); + else + err = restart_array(mddev); + if (err == 0) { + mddev->ro = 2; + set_disk_ro(mddev->gendisk, 0); + } } else { mddev->ro = 2; err = do_md_run(mddev); @@ -2640,6 +2645,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) err = 0; } else { mddev->ro = 0; + set_disk_ro(mddev->gendisk, 0); err = do_md_run(mddev); } break; -- cgit v1.2.3 From 242b363e2207d14125f52a6701cfda7376a2a2fc Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 30 Apr 2008 00:52:31 -0700 Subject: md: remove a stray command from a copy and paste error in resync_start_store Signed-off-by: Dan Williams Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 32b2ad88a2f..75a3f483522 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2460,7 +2460,6 @@ resync_start_show(mddev_t *mddev, char *page) static ssize_t resync_start_store(mddev_t *mddev, const char *buf, size_t len) { - /* can only set chunk_size if array is not yet active */ char *e; unsigned long long n = simple_strtoull(buf, &e, 10); -- cgit v1.2.3 From 11e2ede0228ee0f81ccacd15894908c3bf241f73 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 30 Apr 2008 00:52:32 -0700 Subject: md: prevent duplicates in bind_rdev_to_array Found when trying to reassemble an active externally managed array. Without this check we hit the more noisy "sysfs duplicate" warning in the later call to kobject_add. Signed-off-by: Dan Williams Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 75a3f483522..bec00b201a7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1370,6 +1370,11 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) MD_BUG(); return -EINVAL; } + + /* prevent duplicates */ + if (find_rdev(mddev, rdev->bdev->bd_dev)) + return -EEXIST; + /* make sure rdev->size exceeds mddev->size */ if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { if (mddev->pers) { -- cgit v1.2.3 From 6bfe0b499082fd3950429017cd8ebf2a6c458aa5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 30 Apr 2008 00:52:32 -0700 Subject: md: support blocking writes to an array on device failure Allows a userspace metadata handler to take action upon detecting a device failure. Based on an original patch by Neil Brown. Changes: -added blocked_wait waitqueue to rdev -don't qualify Blocked with Faulty always let userspace block writes -added md_wait_for_blocked_rdev to wait for the block device to be clear, if userspace misses the notification another one is sent every 5 seconds -set MD_RECOVERY_NEEDED after clearing "blocked" -kill DoBlock flag, just test mddev->external Signed-off-by: Dan Williams Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index bec00b201a7..83eb78b0013 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1828,6 +1828,10 @@ state_show(mdk_rdev_t *rdev, char *page) len += sprintf(page+len, "%swrite_mostly",sep); sep = ","; } + if (test_bit(Blocked, &rdev->flags)) { + len += sprintf(page+len, "%sblocked", sep); + sep = ","; + } if (!test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags)) { len += sprintf(page+len, "%sspare", sep); @@ -1844,6 +1848,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) * remove - disconnects the device * writemostly - sets write_mostly * -writemostly - clears write_mostly + * blocked - sets the Blocked flag + * -blocked - clears the Blocked flag */ int err = -EINVAL; if (cmd_match(buf, "faulty") && rdev->mddev->pers) { @@ -1865,6 +1871,16 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) err = 0; } else if (cmd_match(buf, "-writemostly")) { clear_bit(WriteMostly, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "blocked")) { + set_bit(Blocked, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "-blocked")) { + clear_bit(Blocked, &rdev->flags); + wake_up(&rdev->blocked_wait); + set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); + md_wakeup_thread(rdev->mddev->thread); + err = 0; } return err ? err : len; @@ -2194,7 +2210,9 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi goto abort_free; } } + INIT_LIST_HEAD(&rdev->same_set); + init_waitqueue_head(&rdev->blocked_wait); return rdev; @@ -4958,6 +4976,9 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) if (!rdev || test_bit(Faulty, &rdev->flags)) return; + + if (mddev->external) + set_bit(Blocked, &rdev->flags); /* dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", mdname(mddev), @@ -5760,7 +5781,7 @@ static int remove_and_add_spares(mddev_t *mddev) rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk >= 0 && - !mddev->external && + !test_bit(Blocked, &rdev->flags) && (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) && atomic_read(&rdev->nr_pending)==0) { @@ -5959,6 +5980,16 @@ void md_check_recovery(mddev_t *mddev) } } +void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) +{ + sysfs_notify(&rdev->kobj, NULL, "state"); + wait_event_timeout(rdev->blocked_wait, + !test_bit(Blocked, &rdev->flags), + msecs_to_jiffies(5000)); + rdev_dec_pending(rdev, mddev); +} +EXPORT_SYMBOL(md_wait_for_blocked_rdev); + static int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { -- cgit v1.2.3