From e0a115e5aa554b93150a8dc1c3fe15467708abb2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 5 Jun 2008 22:45:52 -0700 Subject: md: fix prexor vs sync_request race During the initial array synchronization process there is a window between when a prexor operation is scheduled to a specific stripe and when it completes for a sync_request to be scheduled to the same stripe. When this happens the prexor completes and the stripe is unconditionally marked "insync", effectively canceling the sync_request for the stripe. Prior to 2.6.23 this was not a problem because the prexor operation was done under sh->lock. The effect in older kernels being that the prexor would still erroneously mark the stripe "insync", but sync_request would be held off and re-mark the stripe as "!in_sync". Change the write completion logic to not mark the stripe "in_sync" if a prexor was performed. The effect of the change is to sometimes not set STRIPE_INSYNC. The worst this can do is cause the resync to stall waiting for STRIPE_INSYNC to be set. If this were happening, then STRIPE_SYNCING would be set and handle_issuing_new_read_requests would cause all available blocks to eventually be read, at which point prexor would never be used on that stripe any more and STRIPE_INSYNC would eventually be set. echo repair > /sys/block/mdN/md/sync_action will correct arrays that may have lost this race. Cc: Signed-off-by: Dan Williams Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 425958a76b8..f0f0585c107 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2645,6 +2645,7 @@ static void handle_stripe5(struct stripe_head *sh) struct r5dev *dev; unsigned long pending = 0; mdk_rdev_t *blocked_rdev = NULL; + int prexor; memset(&s, 0, sizeof(s)); pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " @@ -2774,9 +2775,11 @@ static void handle_stripe5(struct stripe_head *sh) /* leave prexor set until postxor is done, allows us to distinguish * a rmw from a rcw during biodrain */ + prexor = 0; if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { + prexor = 1; clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); @@ -2810,6 +2813,8 @@ static void handle_stripe5(struct stripe_head *sh) if (!test_and_set_bit( STRIPE_OP_IO, &sh->ops.pending)) sh->ops.count++; + if (prexor) + continue; if (!test_bit(R5_Insync, &dev->flags) || (i == sh->pd_idx && s.failed == 0)) set_bit(STRIPE_INSYNC, &sh->state); -- cgit v1.2.3 From a6d8113a986c66aeb379a26b6e0062488b3e59e1 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 5 Jun 2008 22:45:53 -0700 Subject: md: fix uninitialized use of mddev->recovery_wait If an array was created with --assume-clean we will oops when trying to set ->resync_max. Fix this by initializing ->recovery_wait in mddev_find. Cc: Signed-off-by: Dan Williams Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 51c19f86ff9..7cf512a34cc 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -276,6 +276,7 @@ static mddev_t * mddev_find(dev_t unit) atomic_set(&new->active, 1); spin_lock_init(&new->write_lock); init_waitqueue_head(&new->sb_wait); + init_waitqueue_head(&new->recovery_wait); new->reshape_position = MaxSector; new->resync_max = MaxSector; new->level = LEVEL_NONE; @@ -5665,7 +5666,6 @@ void md_do_sync(mddev_t *mddev) window/2,(unsigned long long) max_sectors/2); atomic_set(&mddev->recovery_active, 0); - init_waitqueue_head(&mddev->recovery_wait); last_check = 0; if (j>2) { -- cgit v1.2.3 From c337869d95011495fa181536786e74aa2d7ff031 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 5 Jun 2008 22:45:54 -0700 Subject: md: do not compute parity unless it is on a failed drive If a block is computed (rather than read) then a check/repair operation may be lead to believe that the data on disk is correct, when infact it isn't. So only compute blocks for failed devices. This issue has been around since at least 2.6.12, but has become harder to hit in recent kernels since most reads bypass the cache. echo repair > /sys/block/mdN/md/sync_action will set the parity blocks to the correct state. Cc: Signed-off-by: Dan Williams Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f0f0585c107..c37e256b117 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2002,6 +2002,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, * have quiesced. */ if ((s->uptodate == disks - 1) && + (s->failed && disk_idx == s->failed_num) && !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); set_bit(R5_Wantcompute, &dev->flags); @@ -2087,7 +2088,9 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh, /* we would like to get this block, possibly * by computing it, but we might not be able to */ - if (s->uptodate == disks-1) { + if ((s->uptodate == disks - 1) && + (s->failed && (i == r6s->failed_num[0] || + i == r6s->failed_num[1]))) { pr_debug("Computing stripe %llu block %d\n", (unsigned long long)sh->sector, i); compute_block_1(sh, i, 0); -- cgit v1.2.3 From 8c2e870a625bd336b2e7a65a97c1836acef07322 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 28 Jun 2008 08:30:52 +1000 Subject: Ensure interrupted recovery completed properly (v1 metadata plus bitmap) If, while assembling an array, we find a device which is not fully in-sync with the array, it is important to set the "fullsync" flags. This is an exact analog to the setting of this flag in hot_add_disk methods. Currently, only v1.x metadata supports having devices in an array which are not fully in-sync (it keep track of how in sync they are). The 'fullsync' flag only makes a difference when a write-intent bitmap is being used. In this case it tells recovery to ignore the bitmap and recovery all blocks. This fix is already in place for raid1, but not raid5/6 or raid10. So without this fix, a raid1 ir raid4/5/6 array with version 1.x metadata and a write intent bitmaps, that is stopped in the middle of a recovery, will appear to complete the recovery instantly after it is reassembled, but the recovery will not be correct. If you might have an array like that, issueing echo repair > /sys/block/mdXX/md/sync_action will make sure recovery completes properly. Cc: Signed-off-by: Neil Brown --- drivers/md/raid10.c | 2 ++ drivers/md/raid5.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 1de17da34a9..a71277b640a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2137,6 +2137,8 @@ static int run(mddev_t *mddev) !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; mddev->degraded++; + if (disk->rdev) + conf->fullsync = 1; } } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c37e256b117..475fba4d371 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4305,7 +4305,9 @@ static int run(mddev_t *mddev) " disk %d\n", bdevname(rdev->bdev,b), raid_disk); working_disks++; - } + } else + /* Cannot rely on bitmap to complete recovery */ + conf->fullsync = 1; } /* -- cgit v1.2.3 From efe311431869b40d67911820a309f9a1a41306f3 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 28 Jun 2008 08:31:14 +1000 Subject: Don't acknowlege that stripe-expand is complete until it really is. We shouldn't acknowledge that a stripe has been expanded (When reshaping a raid5 by adding a device) until the moved data has actually been written out. However we are currently acknowledging (by calling md_done_sync) when the POST_XOR is complete and before the write. So track in s.locked whether there are pending writes, and don't call md_done_sync yet if there are. Note: we all set R5_LOCKED on devices which are are about to read from. This probably isn't technically necessary, but is usually done when writing a block, and justifies the use of s.locked here. This bug can lead to a crash if an array is stopped while an reshape is in progress. Cc: Signed-off-by: Neil Brown --- drivers/md/raid5.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 475fba4d371..54c8ee28fcc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2898,6 +2898,8 @@ static void handle_stripe5(struct stripe_head *sh) for (i = conf->raid_disks; i--; ) { set_bit(R5_Wantwrite, &sh->dev[i].flags); + set_bit(R5_LOCKED, &dev->flags); + s.locked++; if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) sh->ops.count++; } @@ -2911,6 +2913,7 @@ static void handle_stripe5(struct stripe_head *sh) conf->raid_disks); s.locked += handle_write_operations5(sh, 1, 1); } else if (s.expanded && + s.locked == 0 && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); -- cgit v1.2.3 From 9bbbca3a0ee09293108b67835c6bdf6196d7bcb3 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 28 Jun 2008 08:31:17 +1000 Subject: Fix error paths if md_probe fails. md_probe can fail (e.g. alloc_disk could fail) without returning an error (as it alway returns NULL). So when we call mddev_find immediately afterwards, we need to check that md_probe actually succeeded. This means checking that mdev->gendisk is non-NULL. cc: Cc: Dave Jones Signed-off-by: Neil Brown --- drivers/md/md.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 7cf512a34cc..2580ac1b9b0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3897,8 +3897,10 @@ static void autorun_devices(int part) md_probe(dev, NULL, NULL); mddev = mddev_find(dev); - if (!mddev) { - printk(KERN_ERR + if (!mddev || !mddev->gendisk) { + if (mddev) + mddev_put(mddev); + printk(KERN_ERR "md: cannot allocate memory for md drive.\n"); break; } -- cgit v1.2.3 From c7f1b2044191a82e7f0a1a674751ed582289e2e0 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 2 Jul 2008 09:34:28 +0100 Subject: dm crypt: use cond_resched Add cond_resched() to prevent monopolising CPU when processing large bios. dm-crypt processes encryption of bios in sector units. If the bio request is big it can spend a long time in the encryption call. Signed-off-by: Milan Broz Tested-by: Yan Li Signed-off-by: Andrew Morton Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 835def11419..ab6a61db63c 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -432,6 +432,7 @@ static int crypt_convert(struct crypt_config *cc, case 0: atomic_dec(&ctx->pending); ctx->sector++; + cond_resched(); continue; /* error */ -- cgit v1.2.3