From 4b6d287f627b5fb6a49f78f9e81649ff98c62bb7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:47 -0700 Subject: [PATCH] md: add write-behind support for md/raid1 If a device is flagged 'WriteMostly' and the array has a bitmap, and the bitmap superblock indicates that write_behind is allowed, then write_behind is enabled for WriteMostly devices. Write requests will be acknowledges as complete to the caller (via b_end_io) when all non-WriteMostly devices have completed the write, but will not be cleared from the bitmap until all devices complete. This requires memory allocation to make a local copy of the data being written. If there is insufficient memory, then we fall-back on normal write semantics. Signed-Off-By: Paul Clements Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid1.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 117 insertions(+), 7 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 28839a8193f..ba7f5f25616 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -222,8 +222,17 @@ static void raid_end_bio_io(r1bio_t *r1_bio) { struct bio *bio = r1_bio->master_bio; - bio_endio(bio, bio->bi_size, - test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); + /* if nobody has done the final endio yet, do it now */ + if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { + PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n", + (bio_data_dir(bio) == WRITE) ? "write" : "read", + (unsigned long long) bio->bi_sector, + (unsigned long long) bio->bi_sector + + (bio->bi_size >> 9) - 1); + + bio_endio(bio, bio->bi_size, + test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); + } free_r1bio(r1_bio); } @@ -292,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); - int mirror; + int mirror, behind; conf_t *conf = mddev_to_conf(r1_bio->mddev); if (bio->bi_size) @@ -323,16 +332,46 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int update_head_pos(mirror, r1_bio); + behind = test_bit(R1BIO_BehindIO, &r1_bio->state); + if (behind) { + if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) + atomic_dec(&r1_bio->behind_remaining); + + /* In behind mode, we ACK the master bio once the I/O has safely + * reached all non-writemostly disks. Setting the Returned bit + * ensures that this gets done only once -- we don't ever want to + * return -EIO here, instead we'll wait */ + + if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && + test_bit(R1BIO_Uptodate, &r1_bio->state)) { + /* Maybe we can return now */ + if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { + struct bio *mbio = r1_bio->master_bio; + PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", + (unsigned long long) mbio->bi_sector, + (unsigned long long) mbio->bi_sector + + (mbio->bi_size >> 9) - 1); + bio_endio(mbio, mbio->bi_size, 0); + } + } + } /* * * Let's see if all mirrored write operations have finished * already. */ if (atomic_dec_and_test(&r1_bio->remaining)) { + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + /* free extra copy of the data pages */ + int i = bio->bi_vcnt; + while (i--) + __free_page(bio->bi_io_vec[i].bv_page); + } /* clear the bitmap if all writes complete successfully */ bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state)); + !test_bit(R1BIO_Degraded, &r1_bio->state), + behind); md_write_end(r1_bio->mddev); raid_end_bio_io(r1_bio); } @@ -562,6 +601,39 @@ static void device_barrier(conf_t *conf, sector_t sect) spin_unlock_irq(&conf->resync_lock); } +/* duplicate the data pages for behind I/O */ +static struct page **alloc_behind_pages(struct bio *bio) +{ + int i; + struct bio_vec *bvec; + struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *), + GFP_NOIO); + if (unlikely(!pages)) + goto do_sync_io; + + memset(pages, 0, bio->bi_vcnt * sizeof(struct page *)); + + bio_for_each_segment(bvec, bio, i) { + pages[i] = alloc_page(GFP_NOIO); + if (unlikely(!pages[i])) + goto do_sync_io; + memcpy(kmap(pages[i]) + bvec->bv_offset, + kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); + kunmap(pages[i]); + kunmap(bvec->bv_page); + } + + return pages; + +do_sync_io: + if (pages) + for (i = 0; i < bio->bi_vcnt && pages[i]; i++) + __free_page(pages[i]); + kfree(pages); + PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); + return NULL; +} + static int make_request(request_queue_t *q, struct bio * bio) { mddev_t *mddev = q->queuedata; @@ -574,6 +646,7 @@ static int make_request(request_queue_t *q, struct bio * bio) struct bitmap *bitmap = mddev->bitmap; unsigned long flags; struct bio_list bl; + struct page **behind_pages = NULL; if (unlikely(bio_barrier(bio))) { bio_endio(bio, bio->bi_size, -EOPNOTSUPP); @@ -613,8 +686,6 @@ static int make_request(request_queue_t *q, struct bio * bio) r1_bio->mddev = mddev; r1_bio->sector = bio->bi_sector; - r1_bio->state = 0; - if (bio_data_dir(bio) == READ) { /* * read balancing logic: @@ -675,13 +746,22 @@ static int make_request(request_queue_t *q, struct bio * bio) } rcu_read_unlock(); + BUG_ON(targets == 0); /* we never fail the last device */ + if (targets < conf->raid_disks) { /* array is degraded, we will not clear the bitmap * on I/O completion (see raid1_end_write_request) */ set_bit(R1BIO_Degraded, &r1_bio->state); } + /* do behind I/O ? */ + if (bitmap && + atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && + (behind_pages = alloc_behind_pages(bio)) != NULL) + set_bit(R1BIO_BehindIO, &r1_bio->state); + atomic_set(&r1_bio->remaining, 0); + atomic_set(&r1_bio->behind_remaining, 0); bio_list_init(&bl); for (i = 0; i < disks; i++) { @@ -698,12 +778,31 @@ static int make_request(request_queue_t *q, struct bio * bio) mbio->bi_rw = WRITE; mbio->bi_private = r1_bio; + if (behind_pages) { + struct bio_vec *bvec; + int j; + + /* Yes, I really want the '__' version so that + * we clear any unused pointer in the io_vec, rather + * than leave them unchanged. This is important + * because when we come to free the pages, we won't + * know the originial bi_idx, so we just free + * them all + */ + __bio_for_each_segment(bvec, mbio, j, 0) + bvec->bv_page = behind_pages[j]; + if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) + atomic_inc(&r1_bio->behind_remaining); + } + atomic_inc(&r1_bio->remaining); bio_list_add(&bl, mbio); } + kfree(behind_pages); /* the behind pages are attached to the bios now */ - bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors); + bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, + test_bit(R1BIO_BehindIO, &r1_bio->state)); spin_lock_irqsave(&conf->device_lock, flags); bio_list_merge(&conf->pending_bio_list, &bl); bio_list_init(&bl); @@ -1471,6 +1570,17 @@ out: static int stop(mddev_t *mddev) { conf_t *conf = mddev_to_conf(mddev); + struct bitmap *bitmap = mddev->bitmap; + int behind_wait = 0; + + /* wait for behind writes to complete */ + while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { + behind_wait++; + printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ); /* wait a second */ + /* need to kick something here to make sure I/O goes? */ + } md_unregister_thread(mddev->thread); mddev->thread = NULL; -- cgit v1.2.3