From 23032a0eb97c8eaae8ac9d17373b53b19d0f5413 Mon Sep 17 00:00:00 2001
From: "Raz Ben-Jehuda(caro)"
Date: Sun, 10 Dec 2006 02:20:45 -0800
Subject: [PATCH] md: define raid5_mergeable_bvec

This will encourage read requests to be on only one device, so we will
often be able to bypass the cache for read requests.

Signed-off-by: Neil Brown
Cc: Jens Axboe
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 52914d5cec7..b86ceba04f6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2611,6 +2611,28 @@ static int raid5_congested(void *data, int bits)
 	return 0;
 }
 
+/* We want read requests to align with chunks where possible,
+ * but write requests don't need to.
+ */
+static int raid5_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec)
+{
+	mddev_t *mddev = q->queuedata;
+	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
+	int max;
+	unsigned int chunk_sectors = mddev->chunk_size >> 9;
+	unsigned int bio_sectors = bio->bi_size >> 9;
+
+	if (bio_data_dir(bio))
+		return biovec->bv_len; /* always allow writes to be mergeable */
+
+	max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
+	if (max < 0) max = 0;
+	if (max <= biovec->bv_len && bio_sectors == 0)
+		return biovec->bv_len;
+	else
+		return max;
+}
+
 static int make_request(request_queue_t *q, struct bio * bi)
 {
 	mddev_t *mddev = q->queuedata;
@@ -3320,6 +3342,8 @@ static int run(mddev_t *mddev)
 	mddev->array_size = mddev->size * (conf->previous_raid_disks -
 					    conf->max_degraded);
 
+	blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
+
 	return 0;
 abort:
 	if (conf) {
--
cgit v1.2.3

From f679623f50545bc0577caf2d0f8675b61162f059 Mon Sep 17 00:00:00 2001
From: "Raz Ben-Jehuda(caro)"
Date: Sun, 10 Dec 2006 02:20:46 -0800
Subject: [PATCH] md: handle bypassing the read cache (assuming nothing fails)

Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b86ceba04f6..269b7771a30 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2633,6 +2633,84 @@ static int raid5_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_
 	return max;
 }
+
+static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
+{
+	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
+	unsigned int chunk_sectors = mddev->chunk_size >> 9;
+	unsigned int bio_sectors = bio->bi_size >> 9;
+
+	return  chunk_sectors >=
+		((sector & (chunk_sectors - 1)) + bio_sectors);
+}
+
+/*
+ * The "raid5_align_endio" should check if the read succeeded and if it
+ * did, call bio_endio on the original bio (having bio_put the new bio
+ * first).
+ * If the read failed..
+ */
+int raid5_align_endio(struct bio *bi, unsigned int bytes , int error)
+{
+	struct bio* raid_bi = bi->bi_private;
+	if (bi->bi_size)
+		return 1;
+	bio_put(bi);
+	bio_endio(raid_bi, bytes, error);
+	return 0;
+}
+
+static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio)
+{
+	mddev_t *mddev = q->queuedata;
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+	const unsigned int raid_disks = conf->raid_disks;
+	const unsigned int data_disks = raid_disks - 1;
+	unsigned int dd_idx, pd_idx;
+	struct bio* align_bi;
+	mdk_rdev_t *rdev;
+
+	if (!in_chunk_boundary(mddev, raid_bio)) {
+		printk("chunk_aligned_read : non aligned\n");
+		return 0;
+	}
+	/*
+	 * use bio_clone to make a copy of the bio
+	 */
+	align_bi = bio_clone(raid_bio, GFP_NOIO);
+	if (!align_bi)
+		return 0;
+	/*
+	 * set bi_end_io to a new function, and set bi_private to the
+	 * original bio.
+	 */
+	align_bi->bi_end_io = raid5_align_endio;
+	align_bi->bi_private = raid_bio;
+	/*
+	 * compute position
+	 */
+	align_bi->bi_sector = raid5_compute_sector(raid_bio->bi_sector,
+						   raid_disks,
+						   data_disks,
+						   &dd_idx,
+						   &pd_idx,
+						   conf);
+
+	rcu_read_lock();
+	rdev = rcu_dereference(conf->disks[dd_idx].rdev);
+	if (rdev && test_bit(In_sync, &rdev->flags)) {
+		align_bi->bi_bdev = rdev->bdev;
+		atomic_inc(&rdev->nr_pending);
+		rcu_read_unlock();
+		generic_make_request(align_bi);
+		return 1;
+	} else {
+		rcu_read_unlock();
+		return 0;
+	}
+}
+
+
 static int make_request(request_queue_t *q, struct bio * bi)
 {
 	mddev_t *mddev = q->queuedata;
--
cgit v1.2.3

From 46031f9a38a9773021f1872abc713d62467ac22e Mon Sep 17 00:00:00 2001
From: "Raz Ben-Jehuda(caro)"
Date: Sun, 10 Dec 2006 02:20:47 -0800
Subject: [PATCH] md: allow reads that have bypassed the cache to be retried on failure

If a bypass-the-cache read fails, we simply try again through the cache.
If it fails again it will trigger normal recovery procedures.

update 1:

From: NeilBrown

1/ chunk_aligned_read and retry_aligned_read assume that
   data_disks == raid_disks - 1, which is not true for raid6.
   So when an aligned read request bypasses the cache, we can get the
   wrong data.

2/ The cloned bio is being used-after-free in raid5_align_endio
   (to test BIO_UPTODATE).

3/ We forgot to add rdev->data_offset when submitting a bio for
   aligned-read.

4/ clone_bio calls blk_recount_segments and then we change bi_bdev,
   so we need to invalidate the segment counts.

5/ We don't de-reference the rdev when the read completes.  This means
   we need to record the rdev so it is still available in the end_io
   routine.  Fortunately bi_next in the original bio is unused at this
   point so we can stuff it in there.

6/ We leak a cloned bio if the target rdev is not usable.

From: NeilBrown

update 2:

1/ When aligned requests fail (read error) they need to be retried via
   the normal method (stripe cache).  As we cannot be sure that we can
   process a single read in one go (we may not be able to allocate all
   the stripes needed) we store a bio-being-retried and a list of
   bios-that-still-need-to-be-retried.  When we find a bio that needs to
   be retried, we should add it to the list, not to single-bio...

2/ We were never incrementing 'scnt' when resubmitting failed aligned
   requests.

[akpm@osdl.org: build fix]
Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 160 insertions(+), 5 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 269b7771a30..2ac2e56a1a4 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -134,6 +134,8 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 			if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
 				list_add_tail(&sh->lru, &conf->inactive_list);
 				wake_up(&conf->wait_for_stripe);
+				if (conf->retry_read_aligned)
+					md_wakeup_thread(conf->mddev->thread);
 			}
 		}
 	}
@@ -2644,19 +2646,81 @@ static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
 		((sector & (chunk_sectors - 1)) + bio_sectors);
 }
 
+/*
+ *  add bio to the retry LIFO  ( in O(1) ... we are in interrupt )
+ *  later sampled by raid5d.
+ */
+static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&conf->device_lock, flags);
+
+	bi->bi_next = conf->retry_read_aligned_list;
+	conf->retry_read_aligned_list = bi;
+
+	spin_unlock_irqrestore(&conf->device_lock, flags);
+	md_wakeup_thread(conf->mddev->thread);
+}
+
+
+static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
+{
+	struct bio *bi;
+
+	bi = conf->retry_read_aligned;
+	if (bi) {
+		conf->retry_read_aligned = NULL;
+		return bi;
+	}
+	bi = conf->retry_read_aligned_list;
+	if(bi) {
+		conf->retry_read_aligned = bi->bi_next;
+		bi->bi_next = NULL;
+		bi->bi_phys_segments = 1; /* biased count of active stripes */
+		bi->bi_hw_segments = 0; /* count of processed stripes */
+	}
+
+	return bi;
+}
+
+
 /*
  * The "raid5_align_endio" should check if the read succeeded and if it
  * did, call bio_endio on the original bio (having bio_put the new bio
  * first).
  * If the read failed..
 */
-int raid5_align_endio(struct bio *bi, unsigned int bytes , int error)
+static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error)
 {
 	struct bio* raid_bi = bi->bi_private;
+	mddev_t *mddev;
+	raid5_conf_t *conf;
+	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+	mdk_rdev_t *rdev;
+
 	if (bi->bi_size)
 		return 1;
 	bio_put(bi);
-	bio_endio(raid_bi, bytes, error);
+
+	mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
+	conf = mddev_to_conf(mddev);
+	rdev = (void*)raid_bi->bi_next;
+	raid_bi->bi_next = NULL;
+
+	rdev_dec_pending(rdev, conf->mddev);
+
+	if (!error && uptodate) {
+		bio_endio(raid_bi, bytes, 0);
+		if (atomic_dec_and_test(&conf->active_aligned_reads))
+			wake_up(&conf->wait_for_stripe);
+		return 0;
+	}
+
+
+	PRINTK("raid5_align_endio : io error...handing IO for a retry\n");
+
+	add_bio_to_retry(raid_bi, conf);
 	return 0;
 }
 
@@ -2665,7 +2729,7 @@ static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio)
 	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev_to_conf(mddev);
 	const unsigned int raid_disks = conf->raid_disks;
-	const unsigned int data_disks = raid_disks - 1;
+	const unsigned int data_disks = raid_disks - conf->max_degraded;
 	unsigned int dd_idx, pd_idx;
 	struct bio* align_bi;
 	mdk_rdev_t *rdev;
@@ -2699,13 +2763,25 @@ static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio)
 	rcu_read_lock();
 	rdev = rcu_dereference(conf->disks[dd_idx].rdev);
 	if (rdev && test_bit(In_sync, &rdev->flags)) {
-		align_bi->bi_bdev = rdev->bdev;
 		atomic_inc(&rdev->nr_pending);
 		rcu_read_unlock();
+		raid_bio->bi_next = (void*)rdev;
+		align_bi->bi_bdev = rdev->bdev;
+		align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
+		align_bi->bi_sector += rdev->data_offset;
+
+		spin_lock_irq(&conf->device_lock);
+		wait_event_lock_irq(conf->wait_for_stripe,
+				    conf->quiesce == 0,
+				    conf->device_lock, /* nothing */);
+		atomic_inc(&conf->active_aligned_reads);
+		spin_unlock_irq(&conf->device_lock);
+
 		generic_make_request(align_bi);
 		return 1;
 	} else {
 		rcu_read_unlock();
+		bio_put(align_bi);
 		return 0;
 	}
 }
@@ -3050,6 +3126,72 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 	return STRIPE_SECTORS;
 }
 
+static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
+{
+	/* We may not be able to submit a whole bio at once as there
+	 * may not be enough stripe_heads available.
+	 * We cannot pre-allocate enough stripe_heads as we may need
+	 * more than exist in the cache (if we allow ever large chunks).
+	 * So we do one stripe head at a time and record in
+	 * ->bi_hw_segments how many have been done.
+	 *
+	 * We *know* that this entire raid_bio is in one chunk, so
+	 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
+	 */
+	struct stripe_head *sh;
+	int dd_idx, pd_idx;
+	sector_t sector, logical_sector, last_sector;
+	int scnt = 0;
+	int remaining;
+	int handled = 0;
+
+	logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+	sector = raid5_compute_sector(	logical_sector,
+					conf->raid_disks,
+					conf->raid_disks - conf->max_degraded,
+					&dd_idx,
+					&pd_idx,
+					conf);
+	last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
+
+	for (; logical_sector < last_sector;
+	     logical_sector += STRIPE_SECTORS, scnt++) {
+
+		if (scnt < raid_bio->bi_hw_segments)
+			/* already done this stripe */
+			continue;
+
+		sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1);
+
+		if (!sh) {
+			/* failed to get a stripe - must wait */
+			raid_bio->bi_hw_segments = scnt;
+			conf->retry_read_aligned = raid_bio;
+			return handled;
+		}
+
+		set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
+		add_stripe_bio(sh, raid_bio, dd_idx, 0);
+		handle_stripe(sh, NULL);
+		release_stripe(sh);
+		handled++;
+	}
+	spin_lock_irq(&conf->device_lock);
+	remaining = --raid_bio->bi_phys_segments;
+	spin_unlock_irq(&conf->device_lock);
+	if (remaining == 0) {
+		int bytes = raid_bio->bi_size;
+
+		raid_bio->bi_size = 0;
+		raid_bio->bi_end_io(raid_bio, bytes, 0);
+	}
+	if (atomic_dec_and_test(&conf->active_aligned_reads))
+		wake_up(&conf->wait_for_stripe);
+	return handled;
+}
+
+
+
 /*
  * This is our raid5 kernel thread.
  *
@@ -3071,6 +3213,7 @@ static void raid5d (mddev_t *mddev)
 	spin_lock_irq(&conf->device_lock);
 	while (1) {
 		struct list_head *first;
+		struct bio *bio;
 
 		if (conf->seq_flush != conf->seq_write) {
 			int seq = conf->seq_flush;
@@ -3087,6 +3230,16 @@ static void raid5d (mddev_t *mddev)
 		    !list_empty(&conf->delayed_list))
 			raid5_activate_delayed(conf);
 
+		while ((bio = remove_bio_from_retry(conf))) {
+			int ok;
+			spin_unlock_irq(&conf->device_lock);
+			ok = retry_aligned_read(conf, bio);
+			spin_lock_irq(&conf->device_lock);
+			if (!ok)
+				break;
+			handled++;
+		}
+
 		if (list_empty(&conf->handle_list))
 			break;
@@ -3274,6 +3427,7 @@ static int run(mddev_t *mddev)
 	INIT_LIST_HEAD(&conf->inactive_list);
 	atomic_set(&conf->active_stripes, 0);
 	atomic_set(&conf->preread_active_stripes, 0);
+	atomic_set(&conf->active_aligned_reads, 0);
 
 	PRINTK("raid5: run(%s) called.\n", mdname(mddev));
@@ -3796,7 +3950,8 @@ static void raid5_quiesce(mddev_t *mddev, int state)
 		spin_lock_irq(&conf->device_lock);
 		conf->quiesce = 1;
 		wait_event_lock_irq(conf->wait_for_stripe,
-				    atomic_read(&conf->active_stripes) == 0,
+				    atomic_read(&conf->active_stripes) == 0 &&
+				    atomic_read(&conf->active_aligned_reads) == 0,
 				    conf->device_lock, /* nothing */);
 		spin_unlock_irq(&conf->device_lock);
 		break;
--
cgit v1.2.3

From 5248861511d6aae4997a5aa7152824d87587b0b6 Mon Sep 17 00:00:00 2001
From: "Raz Ben-Jehuda(caro)"
Date: Sun, 10 Dec 2006 02:20:48 -0800
Subject: [PATCH] md: enable bypassing cache for reads

Call the chunk_aligned_read where appropriate.

Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2ac2e56a1a4..2e676e33744 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2808,6 +2808,11 @@ static int make_request(request_queue_t *q, struct bio * bi)
 	disk_stat_inc(mddev->gendisk, ios[rw]);
 	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi));
 
+	if (bio_data_dir(bi) == READ &&
+	     mddev->reshape_position == MaxSector &&
+	     chunk_aligned_read(q,bi))
+		return 0;
+
 	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 	last_sector = bi->bi_sector + (bi->bi_size>>9);
 	bi->bi_next = NULL;
--
cgit v1.2.3

From b875e531fc82db592d6093594593d5cafde0a1cd Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Sun, 10 Dec 2006 02:20:49 -0800
Subject: [PATCH] md: fix innocuous bug in raid6 stripe_to_pdidx

stripe_to_pdidx finds the index of the parity disk for a given stripe.
It assumes raid5 in that it uses "disks-1" to determine the number of
data disks.

This is incorrect for raid6 but fortunately the two usages cancel each
other out.  The only way that 'data_disks' affects the calculation of
pd_idx in raid5_compute_sector is when it is divided into the sector
number.  But as that sector number is calculated by multiplying in the
wrong value of 'data_disks' the division produces the right value.  So
it is innocuous but needs to be fixed.

Also change the calculation of raid_disks in compute_blocknr to make it
more obviously correct (it seems at first to always use disks-1 too).

Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2e676e33744..d855f9f59b0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -823,7 +823,8 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
 static sector_t compute_blocknr(struct stripe_head *sh, int i)
 {
 	raid5_conf_t *conf = sh->raid_conf;
-	int raid_disks = sh->disks, data_disks = raid_disks - 1;
+	int raid_disks = sh->disks;
+	int data_disks = raid_disks - conf->max_degraded;
 	sector_t new_sector = sh->sector, check;
 	int sectors_per_chunk = conf->chunk_size >> 9;
 	sector_t stripe;
@@ -859,7 +860,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 		}
 		break;
 	case 6:
-		data_disks = raid_disks - 2;
 		if (i == raid6_next_disk(sh->pd_idx, raid_disks))
 			return 0; /* It is the Q disk */
 		switch (conf->algorithm) {
@@ -1355,8 +1355,10 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
 	int pd_idx, dd_idx;
 	int chunk_offset = sector_div(stripe, sectors_per_chunk);
 
-	raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk
-			     + chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf);
+	raid5_compute_sector(stripe * (disks - conf->max_degraded)
+			     *sectors_per_chunk + chunk_offset,
+			     disks, disks - conf->max_degraded,
+			     &dd_idx, &pd_idx, conf);
 	return pd_idx;
 }
--
cgit v1.2.3

From b8c6b645563d641df91fdcfd84a9c73c91d75b61 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Sun, 10 Dec 2006 02:20:50 -0800
Subject: [PATCH] md: remove some old ifdefed-out code from raid5.c

There are some vestiges of old code that was used for bypassing the
stripe cache on reads in raid5.c.
This was never updated after the change from buffer_heads to bios, but
was left as a reminder.  That functionality has now been implemented in
a completely different way, so the old code can go.

Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 63 ------------------------------------------------------
 1 file changed, 63 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d855f9f59b0..347caab99ca 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -544,35 +544,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
 	}
 
 	if (uptodate) {
-#if 0
-		struct bio *bio;
-		unsigned long flags;
-		spin_lock_irqsave(&conf->device_lock, flags);
-		/* we can return a buffer if we bypassed the cache or
-		 * if the top buffer is not in highmem.  If there are
-		 * multiple buffers, leave the extra work to
-		 * handle_stripe
-		 */
-		buffer = sh->bh_read[i];
-		if (buffer &&
-		    (!PageHighMem(buffer->b_page)
-		     || buffer->b_page == bh->b_page )
-			) {
-			sh->bh_read[i] = buffer->b_reqnext;
-			buffer->b_reqnext = NULL;
-		} else
-			buffer = NULL;
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-		if (sh->bh_page[i]==bh->b_page)
-			set_buffer_uptodate(bh);
-		if (buffer) {
-			if (buffer->b_page != bh->b_page)
-				memcpy(buffer->b_data, bh->b_data, bh->b_size);
-			buffer->b_end_io(buffer, 1);
-		}
-#else
 		set_bit(R5_UPTODATE, &sh->dev[i].flags);
-#endif
 		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
 			rdev = conf->disks[i].rdev;
 			printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
@@ -618,14 +590,6 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
 		}
 	}
 	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
-#if 0
-	/* must restore b_page before unlocking buffer... */
-	if (sh->bh_page[i] != bh->b_page) {
-		bh->b_page = sh->bh_page[i];
-		bh->b_data = page_address(bh->b_page);
-		clear_buffer_uptodate(bh);
-	}
-#endif
 	clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
@@ -1619,15 +1583,6 @@ static void handle_stripe5(struct stripe_head *sh)
 				} else if (test_bit(R5_Insync, &dev->flags)) {
 					set_bit(R5_LOCKED, &dev->flags);
 					set_bit(R5_Wantread, &dev->flags);
-#if 0
-					/* if I am just reading this block and we don't have
-					   a failed drive, or any pending writes then sidestep the cache */
-					if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
-					    ! syncing && !failed && !to_write) {
-						sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
-						sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
-					}
-#endif
 					locked++;
 					PRINTK("Reading block %d (sync=%d)\n",
 						i, syncing);
@@ -1645,9 +1600,6 @@ static void handle_stripe5(struct stripe_head *sh)
 			dev = &sh->dev[i];
 			if ((dev->towrite || i == sh->pd_idx) &&
 			    (!test_bit(R5_LOCKED, &dev->flags)
-#if 0
-				|| sh->bh_page[i]!=bh->b_page
-#endif
 				) && !test_bit(R5_UPTODATE, &dev->flags)) {
 				if (test_bit(R5_Insync, &dev->flags)
@@ -1659,9 +1611,6 @@ static void handle_stripe5(struct stripe_head *sh)
 			/* Would I have to read this buffer for reconstruct_write */
 			if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
 			    (!test_bit(R5_LOCKED, &dev->flags)
-#if 0
-			     || sh->bh_page[i] != bh->b_page
-#endif
 				) && !test_bit(R5_UPTODATE, &dev->flags)) {
 				if (test_bit(R5_Insync, &dev->flags)) rcw++;
@@ -2197,15 +2146,6 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 				} else if (test_bit(R5_Insync, &dev->flags)) {
 					set_bit(R5_LOCKED, &dev->flags);
 					set_bit(R5_Wantread, &dev->flags);
-#if 0
-					/* if I am just reading this block and we don't have
-					   a failed drive, or any pending writes then sidestep the cache */
-					if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
-					    ! syncing && !failed && !to_write) {
-						sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
-						sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
-					}
-#endif
 					locked++;
 					PRINTK("Reading block %d (sync=%d)\n",
 						i, syncing);
@@ -2224,9 +2164,6 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 			if (!test_bit(R5_OVERWRITE, &dev->flags) && i != pd_idx && i != qd_idx &&
 			    (!test_bit(R5_LOCKED, &dev->flags)
-#if 0
-			     || sh->bh_page[i] != bh->b_page
-#endif
 				) && !test_bit(R5_UPTODATE, &dev->flags)) {
 				if (test_bit(R5_Insync, &dev->flags)) rcw++;
--
cgit v1.2.3

From c2b00852fbae4f8c45c2651530ded3bd01bde814 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Sun, 10 Dec 2006 02:20:51 -0800
Subject: [PATCH] md: return a non-zero error to bi_end_io as appropriate in raid5

Currently raid5 depends on clearing the BIO_UPTODATE flag to signal an
error to higher levels.  While this should be sufficient, it is safer to
explicitly set the error code as well - less room for confusion.

Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 347caab99ca..377f8bc9b78 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1818,7 +1818,9 @@ static void handle_stripe5(struct stripe_head *sh)
 		return_bi = bi->bi_next;
 		bi->bi_next = NULL;
 		bi->bi_size = 0;
-		bi->bi_end_io(bi, bytes, 0);
+		bi->bi_end_io(bi, bytes,
+			      test_bit(BIO_UPTODATE, &bi->bi_flags)
+				? 0 : -EIO);
 	}
 	for (i=disks; i-- ;) {
 		int rw;
@@ -2359,7 +2361,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 		return_bi = bi->bi_next;
 		bi->bi_next = NULL;
 		bi->bi_size = 0;
-		bi->bi_end_io(bi, bytes, 0);
+		bi->bi_end_io(bi, bytes,
+			      test_bit(BIO_UPTODATE, &bi->bi_flags)
+				? 0 : -EIO);
 	}
 	for (i=disks; i-- ;) {
 		int rw;
@@ -2859,7 +2863,9 @@ static int make_request(request_queue_t *q, struct bio * bi)
 		if ( rw == WRITE )
 			md_write_end(mddev);
 		bi->bi_size = 0;
-		bi->bi_end_io(bi, bytes, 0);
+		bi->bi_end_io(bi, bytes,
+			      test_bit(BIO_UPTODATE, &bi->bi_flags)
+				? 0 : -EIO);
 	}
 	return 0;
 }
@@ -3127,7 +3133,9 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 		int bytes = raid_bio->bi_size;
 
 		raid_bio->bi_size = 0;
-		raid_bio->bi_end_io(raid_bio, bytes, 0);
+		raid_bio->bi_end_io(raid_bio, bytes,
+			      test_bit(BIO_UPTODATE, &raid_bio->bi_flags)
+				? 0 : -EIO);
 	}
 	if (atomic_dec_and_test(&conf->active_aligned_reads))
 		wake_up(&conf->wait_for_stripe);
--
cgit v1.2.3

From 802ba064c49f655d20fed563f2a4924c8256ea10 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Wed, 13 Dec 2006 00:34:13 -0800
Subject: [PATCH] md: Don't assume that READ==0 and WRITE==1 - use the names explicitly

Thanks Jens for alerting me to this.

Cc: Jens Axboe
Cc:
Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 377f8bc9b78..be008f034ad 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1827,16 +1827,16 @@ static void handle_stripe5(struct stripe_head *sh)
 		struct bio *bi;
 		mdk_rdev_t *rdev;
 		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
-			rw = 1;
+			rw = WRITE;
 		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
-			rw = 0;
+			rw = READ;
 		else
 			continue;
 
 		bi = &sh->dev[i].req;
 
 		bi->bi_rw = rw;
-		if (rw)
+		if (rw == WRITE)
 			bi->bi_end_io = raid5_end_write_request;
 		else
 			bi->bi_end_io = raid5_end_read_request;
@@ -1872,7 +1872,7 @@ static void handle_stripe5(struct stripe_head *sh)
 				atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
 			generic_make_request(bi);
 		} else {
-			if (rw == 1)
+			if (rw == WRITE)
 				set_bit(STRIPE_DEGRADED, &sh->state);
 			PRINTK("skip op %ld on disc %d for sector %llu\n",
 				bi->bi_rw, i, (unsigned long long)sh->sector);
@@ -2370,16 +2370,16 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 		struct bio *bi;
 		mdk_rdev_t *rdev;
 		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
-			rw = 1;
+			rw = WRITE;
 		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
-			rw = 0;
+			rw = READ;
 		else
 			continue;
 
 		bi = &sh->dev[i].req;
 
 		bi->bi_rw = rw;
-		if (rw)
+		if (rw == WRITE)
 			bi->bi_end_io = raid5_end_write_request;
 		else
 			bi->bi_end_io = raid5_end_read_request;
@@ -2415,7 +2415,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 				atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
 			generic_make_request(bi);
 		} else {
-			if (rw == 1)
+			if (rw == WRITE)
 				set_bit(STRIPE_DEGRADED, &sh->state);
 			PRINTK("skip op %ld on disc %d for sector %llu\n",
 				bi->bi_rw, i, (unsigned long long)sh->sector);
@@ -2567,7 +2567,7 @@ static int raid5_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_
 	unsigned int chunk_sectors = mddev->chunk_size >> 9;
 	unsigned int bio_sectors = bio->bi_size >> 9;
 
-	if (bio_data_dir(bio))
+	if (bio_data_dir(bio) == WRITE)
 		return biovec->bv_len; /* always allow writes to be mergeable */
 
 	max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
@@ -2751,7 +2751,7 @@ static int make_request(request_queue_t *q, struct bio * bi)
 	disk_stat_inc(mddev->gendisk, ios[rw]);
 	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi));
 
-	if (bio_data_dir(bi) == READ &&
+	if (rw == READ &&
 	     mddev->reshape_position == MaxSector &&
 	     chunk_aligned_read(q,bi))
 		return 0;
--
cgit v1.2.3

From 2a2275d630b982e5f90206f9bc497f6695a3ec5d Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Fri, 26 Jan 2007 00:57:11 -0800
Subject: [PATCH] md: fix potential memalloc deadlock in md

If a GFP_KERNEL allocation is attempted in md while the mddev_lock is
held, it is possible for a deadlock to eventuate.

This happens if the array was marked 'clean', and the memalloc triggers
a write-out to the md device.  For the writeout to succeed, the array
must be marked 'dirty', and that requires getting the mddev_lock.

So, before attempting a GFP_KERNEL allocation while holding the lock,
make sure the array is marked 'dirty' (unless it is currently read-only).

Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index be008f034ad..8a30b297ac3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -405,6 +405,8 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 	if (newsize <= conf->pool_size)
 		return 0; /* never bother to shrink */
 
+	md_allow_write(conf->mddev);
+
 	/* Step 1 */
 	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
 			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
@@ -3250,6 +3252,7 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
 		else
 			break;
 	}
+	md_allow_write(mddev);
 	while (new > conf->max_nr_stripes) {
 		if (grow_one_stripe(conf))
 			conf->max_nr_stripes++;
--
cgit v1.2.3

From c20086de9319ac406f1e96ad459763c9f9965b18 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Fri, 26 Jan 2007 00:57:14 -0800
Subject: [PATCH] md: remove unnecessary printk when raid5 gets an unaligned read.

raid5_mergeable_bvec tries to ensure that raid5 never sees a read
request that does not fit within just one chunk.  However as we must
always accept a single-page read, that is not always possible.

So when "in_chunk_boundary" fails, it might be unusual, but it is not a
problem and printing a message every time is a bad idea.

Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8a30b297ac3..467c16982d0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2680,7 +2680,7 @@ static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio)
 	mdk_rdev_t *rdev;
 
 	if (!in_chunk_boundary(mddev, raid_bio)) {
-		printk("chunk_aligned_read : non aligned\n");
+		PRINTK("chunk_aligned_read : non aligned\n");
 		return 0;
 	}
 	/*
--
cgit v1.2.3

From 387bb17374c5fa057462d00d4ba941d49f45de4d Mon Sep 17 00:00:00 2001
From: Neil Brown
Date: Thu, 8 Feb 2007 14:20:29 -0800
Subject: [PATCH] md: fix various bugs with aligned reads in RAID5

It is possible for raid5 to be sent a bio that is too big for an
underlying device.  So if it is a READ that we pass straight down to a
device, it will fail and confuse RAID5.

So in 'chunk_aligned_read' we check that the bio fits within the
parameters for the target device and if it doesn't fit, fall back on
reading through the stripe cache and making lots of one-page requests.

Note that this is the earliest time we can check against the device
because earlier we don't have a lock on the device, so it could change
underneath us.

Also, the code for handling a retry through the cache when a read fails
has not been tested and was badly broken.  This patch fixes that code.

Signed-off-by: Neil Brown
Cc: "Kai"
Cc:
Cc:
Cc: Jens Axboe
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 42 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 39 insertions(+), 3 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 467c16982d0..11c3d7bfa79 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2620,7 +2620,7 @@ static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
 	}
 	bi = conf->retry_read_aligned_list;
 	if(bi) {
-		conf->retry_read_aligned = bi->bi_next;
+		conf->retry_read_aligned_list = bi->bi_next;
 		bi->bi_next = NULL;
 		bi->bi_phys_segments = 1; /* biased count of active stripes */
 		bi->bi_hw_segments = 0; /* count of processed stripes */
@@ -2669,6 +2669,27 @@ static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error)
 	return 0;
 }
 
+static int bio_fits_rdev(struct bio *bi)
+{
+	request_queue_t *q = bdev_get_queue(bi->bi_bdev);
+
+	if ((bi->bi_size>>9) > q->max_sectors)
+		return 0;
+	blk_recount_segments(q, bi);
+	if (bi->bi_phys_segments > q->max_phys_segments ||
+	    bi->bi_hw_segments > q->max_hw_segments)
+		return 0;
+
+	if (q->merge_bvec_fn)
+		/* it's too hard to apply the merge_bvec_fn at this stage,
+		 * just just give up
+		 */
+		return 0;
+
+	return 1;
+}
+
+
 static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio)
 {
 	mddev_t *mddev = q->queuedata;
@@ -2715,6 +2736,13 @@ static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio)
 		align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
 		align_bi->bi_sector += rdev->data_offset;
 
+		if (!bio_fits_rdev(align_bi)) {
+			/* too big in some way */
+			bio_put(align_bi);
+			rdev_dec_pending(rdev, mddev);
+			return 0;
+		}
+
 		spin_lock_irq(&conf->device_lock);
 		wait_event_lock_irq(conf->wait_for_stripe,
 				    conf->quiesce == 0,
@@ -3107,7 +3135,9 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 	last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
 
 	for (; logical_sector < last_sector;
-	     logical_sector += STRIPE_SECTORS, scnt++) {
+	     logical_sector += STRIPE_SECTORS,
+		     sector += STRIPE_SECTORS,
+		     scnt++) {
 
 		if (scnt < raid_bio->bi_hw_segments)
 			/* already done this stripe */
@@ -3123,7 +3153,13 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 		}
 
 		set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
-		add_stripe_bio(sh, raid_bio, dd_idx, 0);
+		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
+			release_stripe(sh);
+			raid_bio->bi_hw_segments = scnt;
+			conf->retry_read_aligned = raid_bio;
+			return handled;
+		}
+
 		handle_stripe(sh, NULL);
 		release_stripe(sh);
 		handled++;
--
cgit v1.2.3
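
Editor's note: the aligned-read fast path in the patches above hinges on one piece of arithmetic, used by both raid5_mergeable_bvec() and in_chunk_boundary(). Because an md chunk size is a power of two, "sector & (chunk_sectors - 1)" gives the offset of a request inside its chunk, and a read may bypass the stripe cache only if that offset plus the request length still fits within the chunk. The standalone C sketch below illustrates just that check; the function name, chunk size, and sector numbers are made-up example values for illustration, not kernel code or kernel data.

    #include <stdio.h>

    /* Illustration only: mirrors the power-of-two chunk arithmetic used by
     * in_chunk_boundary(); this is not the kernel implementation. */
    static int fits_in_one_chunk(unsigned long long sector,
                                 unsigned int bio_sectors,
                                 unsigned int chunk_sectors)
    {
        /* chunk_sectors must be a power of two, as md requires */
        unsigned int offset = sector & (chunk_sectors - 1);

        return chunk_sectors >= offset + bio_sectors;
    }

    int main(void)
    {
        unsigned int chunk_sectors = 128;  /* hypothetical 64 KiB chunk, 512-byte sectors */

        /* 8-sector read starting on a chunk boundary: eligible for bypass */
        printf("%d\n", fits_in_one_chunk(1024, 8, chunk_sectors));
        /* 8-sector read ending past the chunk boundary: must use the stripe cache */
        printf("%d\n", fits_in_one_chunk(1148, 8, chunk_sectors));
        return 0;
    }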