Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bitmap.c             | 183
-rw-r--r-- | drivers/md/dm-exception-store.c |   9
-rw-r--r-- | drivers/md/dm-raid1.c           |  12
-rw-r--r-- | drivers/md/linear.c             | 100
-rw-r--r-- | drivers/md/md.c                 | 227
-rw-r--r-- | drivers/md/multipath.c          |   5
-rw-r--r-- | drivers/md/raid0.c              |   5
-rw-r--r-- | drivers/md/raid1.c              | 234
-rw-r--r-- | drivers/md/raid10.c             |  46
-rw-r--r-- | drivers/md/raid5.c              | 138
-rw-r--r-- | drivers/md/raid6main.c          | 138
11 files changed, 856 insertions, 241 deletions
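The headline change in this series is write-behind support in the md bitmap: bitmap_startwrite()/bitmap_endwrite() grow a "behind" argument and the bitmap keeps an atomic count of in-flight write-behind I/O, bounded by max_write_behind. Below is a minimal userspace sketch of that accounting pattern. Names like start_write()/end_write() are illustrative, not kernel functions, and in the actual patch the limit check sits in raid1's make_request() while the increment/decrement live in bitmap_startwrite()/bitmap_endwrite(); here both are folded into one helper for brevity.

#include <stdatomic.h>
#include <stdio.h>

/* Stand-ins for bitmap->behind_writes and bitmap->max_write_behind. */
static atomic_int behind_writes;
static const int max_write_behind = 256;

/* A write may go "behind" only while the in-flight count is below the
 * limit; completion drops the count again.  Compare the
 * bitmap_startwrite()/bitmap_endwrite() hunks in bitmap.c below. */
static int start_write(int want_behind)
{
        if (want_behind && atomic_load(&behind_writes) < max_write_behind) {
                atomic_fetch_add(&behind_writes, 1);
                return 1;       /* caller would set R1BIO_BehindIO */
        }
        return 0;               /* fall back to a normal synchronous write */
}

static void end_write(int was_behind)
{
        if (was_behind)
                atomic_fetch_sub(&behind_writes, 1);
}

int main(void)
{
        int behind = start_write(1);
        /* ... submit the mirror writes ... */
        end_write(behind);
        printf("in flight: %d\n", atomic_load(&behind_writes));
        return 0;
}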
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 41df4cda66e..2fba2bbe72d 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -270,19 +270,20 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde if (!page) return ERR_PTR(-ENOMEM); - do { - ITERATE_RDEV(mddev, rdev, tmp) - if (rdev->in_sync && !rdev->faulty) - goto found; - return ERR_PTR(-EIO); - found: + ITERATE_RDEV(mddev, rdev, tmp) { + if (! rdev->in_sync || rdev->faulty) + continue; + target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512); - } while (!sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)); + if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) { + page->index = index; + return page; + } + } + return ERR_PTR(-EIO); - page->index = index; - return page; } static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wait) @@ -437,6 +438,7 @@ void bitmap_print_sb(struct bitmap *bitmap) printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); printk(KERN_DEBUG " sync size: %llu KB\n", (unsigned long long)le64_to_cpu(sb->sync_size)/2); + printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); kunmap(bitmap->sb_page); } @@ -445,7 +447,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) { char *reason = NULL; bitmap_super_t *sb; - unsigned long chunksize, daemon_sleep; + unsigned long chunksize, daemon_sleep, write_behind; unsigned long bytes_read; unsigned long long events; int err = -EINVAL; @@ -474,6 +476,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) chunksize = le32_to_cpu(sb->chunksize); daemon_sleep = le32_to_cpu(sb->daemon_sleep); + write_behind = le32_to_cpu(sb->write_behind); /* verify that the bitmap-specific fields are valid */ if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) @@ -485,7 +488,9 @@ static int bitmap_read_sb(struct bitmap *bitmap) else if ((1 << ffz(~chunksize)) != chunksize) reason = "bitmap chunksize not a power of 2"; else if (daemon_sleep < 1 || daemon_sleep > 15) - reason = "daemon sleep period out of range"; + reason = "daemon sleep period out of range (1-15s)"; + else if (write_behind > COUNTER_MAX) + reason = "write-behind limit out of range (0 - 16383)"; if (reason) { printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n", bmname(bitmap), reason); @@ -518,8 +523,12 @@ success: /* assign fields using values from superblock */ bitmap->chunksize = chunksize; bitmap->daemon_sleep = daemon_sleep; + bitmap->daemon_lastrun = jiffies; + bitmap->max_write_behind = write_behind; bitmap->flags |= sb->state; bitmap->events_cleared = le64_to_cpu(sb->events_cleared); + if (sb->state & BITMAP_STALE) + bitmap->events_cleared = bitmap->mddev->events; err = 0; out: kunmap(bitmap->sb_page); @@ -617,7 +626,7 @@ static void bitmap_file_unmap(struct bitmap *bitmap) page_cache_release(sb_page); } -static void bitmap_stop_daemons(struct bitmap *bitmap); +static void bitmap_stop_daemon(struct bitmap *bitmap); /* dequeue the next item in a page list -- don't call from irq context */ static struct page_list *dequeue_page(struct bitmap *bitmap) @@ -659,7 +668,7 @@ static void bitmap_file_put(struct bitmap *bitmap) bitmap->file = NULL; spin_unlock_irqrestore(&bitmap->lock, flags); - bitmap_stop_daemons(bitmap); + bitmap_stop_daemon(bitmap); drain_write_queues(bitmap); @@ -818,7 +827,7 @@ int bitmap_unplug(struct bitmap *bitmap) return 0; } -static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset); +static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t 
offset, int needed); /* * bitmap_init_from_disk -- called at bitmap_create time to initialize * the in-memory bitmap from the on-disk bitmap -- also, sets up the * memory mapping of the bitmap file @@ -826,8 +835,11 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset); * if there's no bitmap file, or if the bitmap file had been * previously kicked from the array, we mark all the bits as * 1's in order to cause a full resync. + * + * We ignore all bits for sectors that end earlier than 'start'. + * This is used when reading an out-of-date bitmap... */ -static int bitmap_init_from_disk(struct bitmap *bitmap) +static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) { unsigned long i, chunks, index, oldindex, bit; struct page *page = NULL, *oldpage = NULL; @@ -914,7 +926,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap) * whole page and write it out */ memset(page_address(page) + offset, 0xff, - PAGE_SIZE - offset); + PAGE_SIZE - offset); ret = write_page(bitmap, page, 1); if (ret) { kunmap(page); @@ -928,8 +940,11 @@ static int bitmap_init_from_disk(struct bitmap *bitmap) } if (test_bit(bit, page_address(page))) { /* if the disk bit is set, set the memory bit */ - bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap)); + bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap), + ((i+1) << (CHUNK_BLOCK_SHIFT(bitmap)) >= start) + ); bit_cnt++; + set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); } } @@ -1141,6 +1156,9 @@ static void bitmap_writeback_daemon(mddev_t *mddev) err = -EINTR; goto out; } + if (bitmap == NULL) + /* about to be stopped. */ + return; PRINTK("%s: bitmap writeback daemon woke up...\n", bmname(bitmap)); /* wait on bitmap page writebacks */ @@ -1170,21 +1188,12 @@ static void bitmap_writeback_daemon(mddev_t *mddev) } } -static int bitmap_start_daemon(struct bitmap *bitmap, mdk_thread_t **ptr, +static mdk_thread_t *bitmap_start_daemon(struct bitmap *bitmap, void (*func)(mddev_t *), char *name) { mdk_thread_t *daemon; - unsigned long flags; char namebuf[32]; - spin_lock_irqsave(&bitmap->lock, flags); - *ptr = NULL; - - if (!bitmap->file) /* no need for daemon if there's no backing file */ - goto out_unlock; - - spin_unlock_irqrestore(&bitmap->lock, flags); - #ifdef INJECT_FATAL_FAULT_2 daemon = NULL; #else @@ -1194,47 +1203,32 @@ static int bitmap_start_daemon(struct bitmap *bitmap, mdk_thread_t **ptr, if (!daemon) { printk(KERN_ERR "%s: failed to start bitmap daemon\n", bmname(bitmap)); - return -ECHILD; + return ERR_PTR(-ECHILD); } - spin_lock_irqsave(&bitmap->lock, flags); - *ptr = daemon; - md_wakeup_thread(daemon); /* start it running */ PRINTK("%s: %s daemon (pid %d) started...\n", bmname(bitmap), name, daemon->tsk->pid); -out_unlock: - spin_unlock_irqrestore(&bitmap->lock, flags); - return 0; -} -static int bitmap_start_daemons(struct bitmap *bitmap) -{ - int err = bitmap_start_daemon(bitmap, &bitmap->writeback_daemon, - bitmap_writeback_daemon, "bitmap_wb"); - return err; + return daemon; } -static void bitmap_stop_daemon(struct bitmap *bitmap, mdk_thread_t **ptr) +static void bitmap_stop_daemon(struct bitmap *bitmap) { - mdk_thread_t *daemon; - unsigned long flags; - - spin_lock_irqsave(&bitmap->lock, flags); - daemon = *ptr; - *ptr = NULL; - spin_unlock_irqrestore(&bitmap->lock, flags); - if (daemon) - md_unregister_thread(daemon); /* destroy the thread */ -} + /* the daemon can't stop itself... it'll just exit instead... */ + if (bitmap->writeback_daemon && ! 
IS_ERR(bitmap->writeback_daemon) && + current->pid != bitmap->writeback_daemon->tsk->pid) { + mdk_thread_t *daemon; + unsigned long flags; -static void bitmap_stop_daemons(struct bitmap *bitmap) -{ - /* the daemons can't stop themselves... they'll just exit instead... */ - if (bitmap->writeback_daemon && - current->pid != bitmap->writeback_daemon->tsk->pid) - bitmap_stop_daemon(bitmap, &bitmap->writeback_daemon); + spin_lock_irqsave(&bitmap->lock, flags); + daemon = bitmap->writeback_daemon; + bitmap->writeback_daemon = NULL; + spin_unlock_irqrestore(&bitmap->lock, flags); + if (daemon && ! IS_ERR(daemon)) + md_unregister_thread(daemon); /* destroy the thread */ + } } static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, @@ -1274,9 +1268,16 @@ static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, } } -int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors) +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) { if (!bitmap) return 0; + + if (behind) { + atomic_inc(&bitmap->behind_writes); + PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n", + atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); + } + while (sectors) { int blocks; bitmap_counter_t *bmc; @@ -1311,9 +1312,15 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect } void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, - int success) + int success, int behind) { if (!bitmap) return; + if (behind) { + atomic_dec(&bitmap->behind_writes); + PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", + atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); + } + while (sectors) { int blocks; unsigned long flags; @@ -1424,7 +1431,7 @@ void bitmap_close_sync(struct bitmap *bitmap) } } -static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset) +static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) { /* For each chunk covered by any of these sectors, set the * counter to 1 and set resync_needed. They should all @@ -1441,7 +1448,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset) } if (! 
*bmc) { struct page *page; - *bmc = 1 | NEEDED_MASK; + *bmc = 1 | (needed?NEEDED_MASK:0); bitmap_count_page(bitmap, offset, 1); page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)); set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); @@ -1476,17 +1483,14 @@ void bitmap_flush(mddev_t *mddev) /* * free memory that was allocated */ -void bitmap_destroy(mddev_t *mddev) +static void bitmap_free(struct bitmap *bitmap) { unsigned long k, pages; struct bitmap_page *bp; - struct bitmap *bitmap = mddev->bitmap; if (!bitmap) /* there was no bitmap */ return; - mddev->bitmap = NULL; /* disconnect from the md device */ - /* release the bitmap file and kill the daemon */ bitmap_file_put(bitmap); @@ -1504,6 +1508,17 @@ void bitmap_destroy(mddev_t *mddev) kfree(bp); kfree(bitmap); } +void bitmap_destroy(mddev_t *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) /* there was no bitmap */ + return; + + mddev->bitmap = NULL; /* disconnect from the md device */ + + bitmap_free(bitmap); +} /* * initialize the bitmap structure @@ -1517,6 +1532,7 @@ int bitmap_create(mddev_t *mddev) unsigned long pages; struct file *file = mddev->bitmap_file; int err; + sector_t start; BUG_ON(sizeof(bitmap_super_t) != 256); @@ -1533,15 +1549,15 @@ int bitmap_create(mddev_t *mddev) spin_lock_init(&bitmap->lock); bitmap->mddev = mddev; - mddev->bitmap = bitmap; spin_lock_init(&bitmap->write_lock); INIT_LIST_HEAD(&bitmap->complete_pages); init_waitqueue_head(&bitmap->write_wait); bitmap->write_pool = mempool_create(WRITE_POOL_SIZE, write_pool_alloc, write_pool_free, NULL); + err = -ENOMEM; if (!bitmap->write_pool) - return -ENOMEM; + goto error; bitmap->file = file; bitmap->offset = mddev->bitmap_offset; @@ -1549,7 +1565,7 @@ int bitmap_create(mddev_t *mddev) /* read superblock from bitmap file (this sets bitmap->chunksize) */ err = bitmap_read_sb(bitmap); if (err) - return err; + goto error; bitmap->chunkshift = find_first_bit(&bitmap->chunksize, sizeof(bitmap->chunksize)); @@ -1573,27 +1589,44 @@ int bitmap_create(mddev_t *mddev) #else bitmap->bp = kmalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL); #endif + err = -ENOMEM; if (!bitmap->bp) - return -ENOMEM; + goto error; memset(bitmap->bp, 0, pages * sizeof(*bitmap->bp)); bitmap->flags |= BITMAP_ACTIVE; /* now that we have some pages available, initialize the in-memory * bitmap from the on-disk bitmap */ - err = bitmap_init_from_disk(bitmap); + start = 0; + if (mddev->degraded == 0 + || bitmap->events_cleared == mddev->events) + /* no need to keep dirty bits to optimise a re-add of a missing device */ + start = mddev->recovery_cp; + err = bitmap_init_from_disk(bitmap, start); if (err) - return err; + goto error; printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", pages, bmname(bitmap)); - /* kick off the bitmap daemons */ - err = bitmap_start_daemons(bitmap); - if (err) - return err; + mddev->bitmap = bitmap; + + if (file) + /* kick off the bitmap writeback daemon */ + bitmap->writeback_daemon = + bitmap_start_daemon(bitmap, + bitmap_writeback_daemon, + "bitmap_wb"); + + if (IS_ERR(bitmap->writeback_daemon)) + return PTR_ERR(bitmap->writeback_daemon); return bitmap_update_sb(bitmap); + + error: + bitmap_free(bitmap); + return err; } /* the bitmap API -- for raid personalities */ diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index 17212b4201a..cc07bbebbb1 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -568,12 +568,9 @@ int dm_create_persistent(struct exception_store 
*store, uint32_t chunk_size) bad: dm_io_put(sectors_to_pages(chunk_size)); - if (ps) { - if (ps->area) - free_area(ps); - - kfree(ps); - } + if (ps && ps->area) + free_area(ps); + kfree(ps); return r; } diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index b08df8b9b2c..86328251375 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -375,16 +375,18 @@ static void rh_inc(struct region_hash *rh, region_t region) read_lock(&rh->hash_lock); reg = __rh_find(rh, region); + + atomic_inc(®->pending); + + spin_lock_irq(&rh->region_lock); if (reg->state == RH_CLEAN) { rh->log->type->mark_region(rh->log, reg->key); - spin_lock_irq(&rh->region_lock); reg->state = RH_DIRTY; list_del_init(®->list); /* take off the clean list */ - spin_unlock_irq(&rh->region_lock); } + spin_unlock_irq(&rh->region_lock); - atomic_inc(®->pending); read_unlock(&rh->hash_lock); } @@ -408,6 +410,10 @@ static void rh_dec(struct region_hash *rh, region_t region) if (atomic_dec_and_test(®->pending)) { spin_lock_irqsave(&rh->region_lock, flags); + if (atomic_read(®->pending)) { /* check race */ + spin_unlock_irqrestore(&rh->region_lock, flags); + return; + } if (reg->state == RH_RECOVERING) { list_add_tail(®->list, &rh->quiesced_regions); } else { diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 8d740013d74..bb279fad2fd 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -38,7 +38,8 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) /* * sector_div(a,b) returns the remainer and sets a to a/b */ - (void)sector_div(block, conf->smallest->size); + block >>= conf->preshift; + (void)sector_div(block, conf->hash_spacing); hash = conf->hash_table[block]; while ((sector>>1) >= (hash->size + hash->offset)) @@ -47,7 +48,7 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) } /** - * linear_mergeable_bvec -- tell bio layer if a two requests can be merged + * linear_mergeable_bvec -- tell bio layer if two requests can be merged * @q: request queue * @bio: the buffer head that's been built up so far * @biovec: the request that could be merged to it. @@ -116,7 +117,7 @@ static int linear_run (mddev_t *mddev) dev_info_t **table; mdk_rdev_t *rdev; int i, nb_zone, cnt; - sector_t start; + sector_t min_spacing; sector_t curr_offset; struct list_head *tmp; @@ -127,11 +128,6 @@ static int linear_run (mddev_t *mddev) memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t)); mddev->private = conf; - /* - * Find the smallest device. - */ - - conf->smallest = NULL; cnt = 0; mddev->array_size = 0; @@ -159,8 +155,6 @@ static int linear_run (mddev_t *mddev) disk->size = rdev->size; mddev->array_size += rdev->size; - if (!conf->smallest || (disk->size < conf->smallest->size)) - conf->smallest = disk; cnt++; } if (cnt != mddev->raid_disks) { @@ -168,6 +162,36 @@ static int linear_run (mddev_t *mddev) goto out; } + min_spacing = mddev->array_size; + sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *)); + + /* min_spacing is the minimum spacing that will fit the hash + * table in one PAGE. This may be much smaller than needed. 
+ * We find the smallest non-terminal set of consecutive devices + * that is larger than min_spacing as use the size of that as + * the actual spacing + */ + conf->hash_spacing = mddev->array_size; + for (i=0; i < cnt-1 ; i++) { + sector_t sz = 0; + int j; + for (j=i; i<cnt-1 && sz < min_spacing ; j++) + sz += conf->disks[j].size; + if (sz >= min_spacing && sz < conf->hash_spacing) + conf->hash_spacing = sz; + } + + /* hash_spacing may be too large for sector_div to work with, + * so we might need to pre-shift + */ + conf->preshift = 0; + if (sizeof(sector_t) > sizeof(u32)) { + sector_t space = conf->hash_spacing; + while (space > (sector_t)(~(u32)0)) { + space >>= 1; + conf->preshift++; + } + } /* * This code was restructured to work around a gcc-2.95.3 internal * compiler error. Alter it with care. @@ -177,39 +201,52 @@ static int linear_run (mddev_t *mddev) unsigned round; unsigned long base; - sz = mddev->array_size; - base = conf->smallest->size; + sz = mddev->array_size >> conf->preshift; + sz += 1; /* force round-up */ + base = conf->hash_spacing >> conf->preshift; round = sector_div(sz, base); - nb_zone = conf->nr_zones = sz + (round ? 1 : 0); + nb_zone = sz + (round ? 1 : 0); } - - conf->hash_table = kmalloc (sizeof (dev_info_t*) * nb_zone, + BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *)); + + conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone, GFP_KERNEL); if (!conf->hash_table) goto out; /* * Here we generate the linear hash table + * First calculate the device offsets. */ + conf->disks[0].offset = 0; + for (i=1; i<mddev->raid_disks; i++) + conf->disks[i].offset = + conf->disks[i-1].offset + + conf->disks[i-1].size; + table = conf->hash_table; - start = 0; curr_offset = 0; - for (i = 0; i < cnt; i++) { - dev_info_t *disk = conf->disks + i; + i = 0; + for (curr_offset = 0; + curr_offset < mddev->array_size; + curr_offset += conf->hash_spacing) { - disk->offset = curr_offset; - curr_offset += disk->size; + while (i < mddev->raid_disks-1 && + curr_offset >= conf->disks[i+1].offset) + i++; - /* 'curr_offset' is the end of this disk - * 'start' is the start of table + *table ++ = conf->disks + i; + } + + if (conf->preshift) { + conf->hash_spacing >>= conf->preshift; + /* round hash_spacing up so that when we divide by it, + * we err on the side of "too-low", which is safest. 
*/ - while (start < curr_offset) { - *table++ = disk; - start += conf->smallest->size; - } + conf->hash_spacing++; } - if (table-conf->hash_table != nb_zone) - BUG(); + + BUG_ON(table - conf->hash_table > nb_zone); blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->unplug_fn = linear_unplug; @@ -238,6 +275,11 @@ static int linear_make_request (request_queue_t *q, struct bio *bio) dev_info_t *tmp_dev; sector_t block; + if (unlikely(bio_barrier(bio))) { + bio_endio(bio, bio->bi_size, -EOPNOTSUPP); + return 0; + } + if (bio_data_dir(bio)==WRITE) { disk_stat_inc(mddev->gendisk, writes); disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); @@ -294,7 +336,7 @@ static void linear_status (struct seq_file *seq, mddev_t *mddev) sector_t s = 0; seq_printf(seq, " "); - for (j = 0; j < conf->nr_zones; j++) + for (j = 0; j < mddev->raid_disks; j++) { char b[BDEVNAME_SIZE]; s += conf->smallest_size; diff --git a/drivers/md/md.c b/drivers/md/md.c index 20ca80b7dc2..2897df90df4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -34,6 +34,7 @@ #include <linux/module.h> #include <linux/config.h> +#include <linux/kthread.h> #include <linux/linkage.h> #include <linux/raid/md.h> #include <linux/raid/bitmap.h> @@ -73,7 +74,7 @@ static DEFINE_SPINLOCK(pers_lock); * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' * is 1000 KB/sec, so the extra system load does not show up that much. * Increase it if you want to have more _guaranteed_ speed. Note that - * the RAID driver will use the maximum available bandwith if the IO + * the RAID driver will use the maximum available bandwidth if the IO * subsystem is idle. There is also an 'absolute maximum' reconstruction * speed limit - in case reconstruction slows down your system despite * idle IO detection. 
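Several personalities in this series (linear, multipath, raid0, raid1, raid10, raid5) gain the same guard at the top of make_request: barrier bios are not supported, so they are completed immediately with -EOPNOTSUPP instead of being queued. A self-contained sketch of that early-return shape, using simplified stand-in types for struct bio and bio_barrier():

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for struct bio and its helpers. */
struct bio { bool barrier; unsigned size; int status; };
static bool bio_barrier(const struct bio *b) { return b->barrier; }
static void bio_endio(struct bio *b, unsigned done, int err)
{ (void)done; b->status = err; }

/* The guard each personality gains: fail barrier requests up
 * front rather than mapping and submitting them. */
static int make_request(struct bio *bio)
{
        if (bio_barrier(bio)) {
                bio_endio(bio, bio->size, -EOPNOTSUPP);
                return 0;
        }
        /* ... normal mapping and submission ... */
        return 0;
}

int main(void)
{
        struct bio b = { .barrier = true, .size = 4096 };
        make_request(&b);
        printf("status: %d\n", b.status);       /* prints -EOPNOTSUPP */
        return 0;
}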
@@ -393,7 +394,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size, return ret; } -static int read_disk_sb(mdk_rdev_t * rdev) +static int read_disk_sb(mdk_rdev_t * rdev, int size) { char b[BDEVNAME_SIZE]; if (!rdev->sb_page) { @@ -404,7 +405,7 @@ static int read_disk_sb(mdk_rdev_t * rdev) return 0; - if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) + if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) goto fail; rdev->sb_loaded = 1; return 0; @@ -531,7 +532,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version sb_offset = calc_dev_sboffset(rdev->bdev); rdev->sb_offset = sb_offset; - ret = read_disk_sb(rdev); + ret = read_disk_sb(rdev, MD_SB_BYTES); if (ret) return ret; ret = -EINVAL; @@ -564,6 +565,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version rdev->preferred_minor = sb->md_minor; rdev->data_offset = 0; + rdev->sb_size = MD_SB_BYTES; if (sb->level == LEVEL_MULTIPATH) rdev->desc_nr = -1; @@ -623,6 +625,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->size = sb->size; mddev->events = md_event(sb); mddev->bitmap_offset = 0; + mddev->default_bitmap_offset = MD_SB_BYTES >> 9; if (sb->state & (1<<MD_SB_CLEAN)) mddev->recovery_cp = MaxSector; @@ -643,12 +646,12 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && mddev->bitmap_file == NULL) { - if (mddev->level != 1) { + if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) { /* FIXME use a better test */ printk(KERN_WARNING "md: bitmaps only support for raid1\n"); return -EINVAL; } - mddev->bitmap_offset = (MD_SB_BYTES >> 9); + mddev->bitmap_offset = mddev->default_bitmap_offset; } } else if (mddev->pers == NULL) { @@ -669,6 +672,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) if (mddev->level != LEVEL_MULTIPATH) { rdev->faulty = 0; + rdev->flags = 0; desc = sb->disks + rdev->desc_nr; if (desc->state & (1<<MD_DISK_FAULTY)) @@ -678,6 +682,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) rdev->in_sync = 1; rdev->raid_disk = desc->raid_disk; } + if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) + set_bit(WriteMostly, &rdev->flags); } else /* MULTIPATH are always insync */ rdev->in_sync = 1; return 0; @@ -706,6 +712,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) int i; int active=0, working=0,failed=0,spare=0,nr_disks=0; + rdev->sb_size = MD_SB_BYTES; + sb = (mdp_super_t*)page_address(rdev->sb_page); memset(sb, 0, sizeof(*sb)); @@ -776,6 +784,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) spare++; working++; } + if (test_bit(WriteMostly, &rdev2->flags)) + d->state |= (1<<MD_DISK_WRITEMOSTLY); } /* now set the "removed" and "faulty" bits on any missing devices */ @@ -831,6 +841,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) int ret; sector_t sb_offset; char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; + int bmask; /* * Calculate the position of the superblock. 
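With read_disk_sb() now taking an explicit size, the super_1_load() hunk just below computes sb_size from max_dev and then rounds it up to the device's hard-sector size using a power-of-two mask (bmask = hardsect - 1). That rounding idiom, extracted into a small standalone function for clarity:

#include <stdio.h>

/* Round sb_size up to the next multiple of hardsect (a power of
 * two): with bmask = hardsect - 1, (size | bmask) + 1 is the next
 * aligned value, as in the super_1_load() change below. */
static unsigned round_up_sb(unsigned sb_size, unsigned hardsect)
{
        unsigned bmask = hardsect - 1;
        if (sb_size & bmask)
                sb_size = (sb_size | bmask) + 1;
        return sb_size;
}

int main(void)
{
        printf("%u\n", round_up_sb(1024, 512)); /* 1024 (already aligned) */
        printf("%u\n", round_up_sb(1025, 512)); /* 1536 (padded up) */
        return 0;
}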
@@ -859,7 +870,10 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) } rdev->sb_offset = sb_offset; - ret = read_disk_sb(rdev); + /* superblock is rarely larger than 1K, but it can be larger, + * and it is safe to read 4k, so we do that + */ + ret = read_disk_sb(rdev, 4096); if (ret) return ret; @@ -869,7 +883,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) sb->major_version != cpu_to_le32(1) || le32_to_cpu(sb->max_dev) > (4096-256)/2 || le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || - sb->feature_map != 0) + (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) return -EINVAL; if (calc_sb_1_csum(sb) != sb->sb_csum) { @@ -885,6 +899,11 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); + rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; + bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; + if (rdev->sb_size & bmask) + rdev-> sb_size = (rdev->sb_size | bmask)+1; + if (refdev == 0) return 1; else { @@ -939,13 +958,15 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->size = le64_to_cpu(sb->size)/2; mddev->events = le64_to_cpu(sb->events); mddev->bitmap_offset = 0; + mddev->default_bitmap_offset = 0; + mddev->default_bitmap_offset = 1024; mddev->recovery_cp = le64_to_cpu(sb->resync_offset); memcpy(mddev->uuid, sb->set_uuid, 16); mddev->max_disks = (4096-256)/2; - if ((le32_to_cpu(sb->feature_map) & 1) && + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && mddev->bitmap_file == NULL ) { if (mddev->level != 1) { printk(KERN_WARNING "md: bitmaps only supported for raid1\n"); @@ -986,6 +1007,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) rdev->raid_disk = role; break; } + rdev->flags = 0; + if (sb->devflags & WriteMostly1) + set_bit(WriteMostly, &rdev->flags); } else /* MULTIPATH are always insync */ rdev->in_sync = 1; @@ -1017,7 +1041,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) if (mddev->bitmap && mddev->bitmap_file == NULL) { sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); - sb->feature_map = cpu_to_le32(1); + sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } max_dev = 0; @@ -1363,7 +1387,7 @@ repeat: dprintk("%s ", bdevname(rdev->bdev,b)); if (!rdev->faulty) { md_super_write(mddev,rdev, - rdev->sb_offset<<1, MD_SB_BYTES, + rdev->sb_offset<<1, rdev->sb_size, rdev->sb_page); dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", bdevname(rdev->bdev,b), @@ -2073,6 +2097,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg) info.state = 0; if (mddev->in_sync) info.state = (1<<MD_SB_CLEAN); + if (mddev->bitmap && mddev->bitmap_offset) + info.state = (1<<MD_SB_BITMAP_PRESENT); info.active_disks = active; info.working_disks = working; info.failed_disks = failed; @@ -2087,7 +2113,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) return 0; } -static int get_bitmap_file(mddev_t * mddev, void * arg) +static int get_bitmap_file(mddev_t * mddev, void __user * arg) { mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ char *ptr, *buf = NULL; @@ -2146,6 +2172,8 @@ static int get_disk_info(mddev_t * mddev, void __user * arg) info.state |= (1<<MD_DISK_ACTIVE); info.state |= (1<<MD_DISK_SYNC); } + if (test_bit(WriteMostly, &rdev->flags)) + info.state |= (1<<MD_DISK_WRITEMOSTLY); } else { info.major = info.minor = 0; info.raid_disk = -1; @@ -2210,8 +2238,11 @@ static 
int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) mdname(mddev)); return -EINVAL; } - rdev = md_import_device(dev, mddev->major_version, - mddev->minor_version); + if (mddev->persistent) + rdev = md_import_device(dev, mddev->major_version, + mddev->minor_version); + else + rdev = md_import_device(dev, -1, -1); if (IS_ERR(rdev)) { printk(KERN_WARNING "md: md_import_device returned %ld\n", @@ -2231,6 +2262,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) rdev->saved_raid_disk = rdev->raid_disk; rdev->in_sync = 0; /* just to be sure */ + if (info->state & (1<<MD_DISK_WRITEMOSTLY)) + set_bit(WriteMostly, &rdev->flags); + rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (err) @@ -2271,6 +2305,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) else rdev->in_sync = 0; + if (info->state & (1<<MD_DISK_WRITEMOSTLY)) + set_bit(WriteMostly, &rdev->flags); + err = bind_rdev_to_array(rdev, mddev); if (err) { export_rdev(rdev); @@ -2430,25 +2467,51 @@ static int set_bitmap_file(mddev_t *mddev, int fd) { int err; - if (mddev->pers) - return -EBUSY; + if (mddev->pers) { + if (!mddev->pers->quiesce) + return -EBUSY; + if (mddev->recovery || mddev->sync_thread) + return -EBUSY; + /* we should be able to change the bitmap.. */ + } - mddev->bitmap_file = fget(fd); - if (mddev->bitmap_file == NULL) { - printk(KERN_ERR "%s: error: failed to get bitmap file\n", - mdname(mddev)); - return -EBADF; - } + if (fd >= 0) { + if (mddev->bitmap) + return -EEXIST; /* cannot add when bitmap is present */ + mddev->bitmap_file = fget(fd); - err = deny_bitmap_write_access(mddev->bitmap_file); - if (err) { - printk(KERN_ERR "%s: error: bitmap file is already in use\n", - mdname(mddev)); - fput(mddev->bitmap_file); - mddev->bitmap_file = NULL; - } else + if (mddev->bitmap_file == NULL) { + printk(KERN_ERR "%s: error: failed to get bitmap file\n", + mdname(mddev)); + return -EBADF; + } + + err = deny_bitmap_write_access(mddev->bitmap_file); + if (err) { + printk(KERN_ERR "%s: error: bitmap file is already in use\n", + mdname(mddev)); + fput(mddev->bitmap_file); + mddev->bitmap_file = NULL; + return err; + } mddev->bitmap_offset = 0; /* file overrides offset */ + } else if (mddev->bitmap == NULL) + return -ENOENT; /* cannot remove what isn't there */ + err = 0; + if (mddev->pers) { + mddev->pers->quiesce(mddev, 1); + if (fd >= 0) + err = bitmap_create(mddev); + if (fd < 0 || err) + bitmap_destroy(mddev); + mddev->pers->quiesce(mddev, 0); + } else if (fd < 0) { + if (mddev->bitmap_file) + fput(mddev->bitmap_file); + mddev->bitmap_file = NULL; + } + return err; } @@ -2528,6 +2591,11 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) { int rv = 0; int cnt = 0; + int state = 0; + + /* calculate expected state,ignoring low bits */ + if (mddev->bitmap && mddev->bitmap_offset) + state |= (1 << MD_SB_BITMAP_PRESENT); if (mddev->major_version != info->major_version || mddev->minor_version != info->minor_version || @@ -2536,12 +2604,16 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) mddev->level != info->level || /* mddev->layout != info->layout || */ !mddev->persistent != info->not_persistent|| - mddev->chunk_size != info->chunk_size ) + mddev->chunk_size != info->chunk_size || + /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ + ((state^info->state) & 0xfffffe00) + ) return -EINVAL; /* Check there is only one change */ if (mddev->size != info->size) cnt++; if (mddev->raid_disks != info->raid_disks) cnt++; if 
(mddev->layout != info->layout) cnt++; + if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; if (cnt == 0) return 0; if (cnt > 1) return -EINVAL; @@ -2620,6 +2692,35 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) } } } + if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { + if (mddev->pers->quiesce == NULL) + return -EINVAL; + if (mddev->recovery || mddev->sync_thread) + return -EBUSY; + if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { + /* add the bitmap */ + if (mddev->bitmap) + return -EEXIST; + if (mddev->default_bitmap_offset == 0) + return -EINVAL; + mddev->bitmap_offset = mddev->default_bitmap_offset; + mddev->pers->quiesce(mddev, 1); + rv = bitmap_create(mddev); + if (rv) + bitmap_destroy(mddev); + mddev->pers->quiesce(mddev, 0); + } else { + /* remove the bitmap */ + if (!mddev->bitmap) + return -ENOENT; + if (mddev->bitmap->file) + return -EINVAL; + mddev->pers->quiesce(mddev, 1); + bitmap_destroy(mddev); + mddev->pers->quiesce(mddev, 0); + mddev->bitmap_offset = 0; + } + } md_update_sb(mddev); return rv; } @@ -2781,7 +2882,7 @@ static int md_ioctl(struct inode *inode, struct file *file, goto done_unlock; case GET_BITMAP_FILE: - err = get_bitmap_file(mddev, (void *)arg); + err = get_bitmap_file(mddev, argp); goto done_unlock; case GET_DISK_INFO: @@ -2950,18 +3051,6 @@ static int md_thread(void * arg) { mdk_thread_t *thread = arg; - lock_kernel(); - - /* - * Detach thread - */ - - daemonize(thread->name, mdname(thread->mddev)); - - current->exit_signal = SIGCHLD; - allow_signal(SIGKILL); - thread->tsk = current; - /* * md_thread is a 'system-thread', it's priority should be very * high. We avoid resource deadlocks individually in each @@ -2973,14 +3062,14 @@ static int md_thread(void * arg) * bdflush, otherwise bdflush will deadlock if there are too * many dirty RAID5 blocks. 
*/ - unlock_kernel(); complete(thread->event); - while (thread->run) { + while (!kthread_should_stop()) { void (*run)(mddev_t *); wait_event_interruptible_timeout(thread->wqueue, - test_bit(THREAD_WAKEUP, &thread->flags), + test_bit(THREAD_WAKEUP, &thread->flags) + || kthread_should_stop(), thread->timeout); try_to_freeze(); @@ -2989,11 +3078,8 @@ static int md_thread(void * arg) run = thread->run; if (run) run(thread->mddev); - - if (signal_pending(current)) - flush_signals(current); } - complete(thread->event); + return 0; } @@ -3010,11 +3096,9 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, const char *name) { mdk_thread_t *thread; - int ret; struct completion event; - thread = (mdk_thread_t *) kmalloc - (sizeof(mdk_thread_t), GFP_KERNEL); + thread = kmalloc(sizeof(mdk_thread_t), GFP_KERNEL); if (!thread) return NULL; @@ -3027,8 +3111,8 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, thread->mddev = mddev; thread->name = name; thread->timeout = MAX_SCHEDULE_TIMEOUT; - ret = kernel_thread(md_thread, thread, 0); - if (ret < 0) { + thread->tsk = kthread_run(md_thread, thread, mdname(thread->mddev)); + if (IS_ERR(thread->tsk)) { kfree(thread); return NULL; } @@ -3038,21 +3122,9 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, void md_unregister_thread(mdk_thread_t *thread) { - struct completion event; - - init_completion(&event); - - thread->event = &event; - - /* As soon as ->run is set to NULL, the task could disappear, - * so we need to hold tasklist_lock until we have sent the signal - */ dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); - read_lock(&tasklist_lock); - thread->run = NULL; - send_sig(SIGKILL, thread->tsk, 1); - read_unlock(&tasklist_lock); - wait_for_completion(&event); + + kthread_stop(thread->tsk); kfree(thread); } @@ -3259,10 +3331,13 @@ static int md_seq_show(struct seq_file *seq, void *v) char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", bdevname(rdev->bdev,b), rdev->desc_nr); + if (test_bit(WriteMostly, &rdev->flags)) + seq_printf(seq, "(W)"); if (rdev->faulty) { seq_printf(seq, "(F)"); continue; - } + } else if (rdev->raid_disk < 0) + seq_printf(seq, "(S)"); /* spare */ size += rdev->size; } @@ -3274,6 +3349,15 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n %llu blocks", (unsigned long long)size); } + if (mddev->persistent) { + if (mddev->major_version != 0 || + mddev->minor_version != 90) { + seq_printf(seq," super %d.%d", + mddev->major_version, + mddev->minor_version); + } + } else + seq_printf(seq, " super non-persistent"); if (mddev->pers) { mddev->pers->status (seq, mddev); @@ -3416,7 +3500,6 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) */ void md_write_start(mddev_t *mddev, struct bio *bi) { - DEFINE_WAIT(w); if (bio_data_dir(bi) != WRITE) return; @@ -3533,7 +3616,7 @@ static void md_do_sync(mddev_t *mddev) printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" " %d KB/sec/disc.\n", sysctl_speed_limit_min); - printk(KERN_INFO "md: using maximum available idle IO bandwith " + printk(KERN_INFO "md: using maximum available idle IO bandwidth " "(but not more than %d KB/sec) for reconstruction.\n", sysctl_speed_limit_max); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 2d2ca7fa026..286342375fb 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -169,6 +169,11 @@ static int multipath_make_request (request_queue_t *q, 
struct bio * bio) struct multipath_bh * mp_bh; struct multipath_info *multipath; + if (unlikely(bio_barrier(bio))) { + bio_endio(bio, bio->bi_size, -EOPNOTSUPP); + return 0; + } + mp_bh = mempool_alloc(conf->pool, GFP_NOIO); mp_bh->master_bio = bio; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 2120710172c..f6757259ce7 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -404,6 +404,11 @@ static int raid0_make_request (request_queue_t *q, struct bio *bio) unsigned long chunk; sector_t block, rsect; + if (unlikely(bio_barrier(bio))) { + bio_endio(bio, bio->bi_size, -EOPNOTSUPP); + return 0; + } + if (bio_data_dir(bio)==WRITE) { disk_stat_inc(mddev->gendisk, writes); disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 51d9645ed09..a93ca478142 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -222,8 +222,17 @@ static void raid_end_bio_io(r1bio_t *r1_bio) { struct bio *bio = r1_bio->master_bio; - bio_endio(bio, bio->bi_size, - test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); + /* if nobody has done the final endio yet, do it now */ + if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { + PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n", + (bio_data_dir(bio) == WRITE) ? "write" : "read", + (unsigned long long) bio->bi_sector, + (unsigned long long) bio->bi_sector + + (bio->bi_size >> 9) - 1); + + bio_endio(bio, bio->bi_size, + test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); + } free_r1bio(r1_bio); } @@ -292,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); - int mirror; + int mirror, behind; conf_t *conf = mddev_to_conf(r1_bio->mddev); if (bio->bi_size) @@ -323,16 +332,46 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int update_head_pos(mirror, r1_bio); + behind = test_bit(R1BIO_BehindIO, &r1_bio->state); + if (behind) { + if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) + atomic_dec(&r1_bio->behind_remaining); + + /* In behind mode, we ACK the master bio once the I/O has safely + * reached all non-writemostly disks. Setting the Returned bit + * ensures that this gets done only once -- we don't ever want to + * return -EIO here, instead we'll wait */ + + if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && + test_bit(R1BIO_Uptodate, &r1_bio->state)) { + /* Maybe we can return now */ + if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { + struct bio *mbio = r1_bio->master_bio; + PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", + (unsigned long long) mbio->bi_sector, + (unsigned long long) mbio->bi_sector + + (mbio->bi_size >> 9) - 1); + bio_endio(mbio, mbio->bi_size, 0); + } + } + } /* * * Let's see if all mirrored write operations have finished * already. 
*/ if (atomic_dec_and_test(&r1_bio->remaining)) { + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + /* free extra copy of the data pages */ + int i = bio->bi_vcnt; + while (i--) + __free_page(bio->bi_io_vec[i].bv_page); + } /* clear the bitmap if all writes complete successfully */ bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state)); + !test_bit(R1BIO_Degraded, &r1_bio->state), + behind); md_write_end(r1_bio->mddev); raid_end_bio_io(r1_bio); } @@ -360,13 +399,14 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) { const unsigned long this_sector = r1_bio->sector; int new_disk = conf->last_used, disk = new_disk; + int wonly_disk = -1; const int sectors = r1_bio->sectors; sector_t new_distance, current_distance; - mdk_rdev_t *new_rdev, *rdev; + mdk_rdev_t *rdev; rcu_read_lock(); /* - * Check if it if we can balance. We can balance on the whole + * Check if we can balance. We can balance on the whole * device if no resync is going on, or below the resync window. * We take the first readable disk when above the resync window. */ @@ -376,11 +416,16 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) /* Choose the first operation device, for consistancy */ new_disk = 0; - while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || - !new_rdev->in_sync) { - new_disk++; - if (new_disk == conf->raid_disks) { - new_disk = -1; + for (rdev = conf->mirrors[new_disk].rdev; + !rdev || !rdev->in_sync + || test_bit(WriteMostly, &rdev->flags); + rdev = conf->mirrors[++new_disk].rdev) { + + if (rdev && rdev->in_sync) + wonly_disk = new_disk; + + if (new_disk == conf->raid_disks - 1) { + new_disk = wonly_disk; break; } } @@ -389,16 +434,26 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) /* make sure the disk is operational */ - while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || - !new_rdev->in_sync) { + for (rdev = conf->mirrors[new_disk].rdev; + !rdev || !rdev->in_sync || + test_bit(WriteMostly, &rdev->flags); + rdev = conf->mirrors[new_disk].rdev) { + + if (rdev && rdev->in_sync) + wonly_disk = new_disk; + if (new_disk <= 0) new_disk = conf->raid_disks; new_disk--; if (new_disk == disk) { - new_disk = -1; - goto rb_out; + new_disk = wonly_disk; + break; } } + + if (new_disk < 0) + goto rb_out; + disk = new_disk; /* now disk == new_disk == starting point for search */ @@ -419,37 +474,41 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) disk = conf->raid_disks; disk--; - if ((rdev=conf->mirrors[disk].rdev) == NULL || - !rdev->in_sync) + rdev = conf->mirrors[disk].rdev; + + if (!rdev || + !rdev->in_sync || + test_bit(WriteMostly, &rdev->flags)) continue; if (!atomic_read(&rdev->nr_pending)) { new_disk = disk; - new_rdev = rdev; break; } new_distance = abs(this_sector - conf->mirrors[disk].head_position); if (new_distance < current_distance) { current_distance = new_distance; new_disk = disk; - new_rdev = rdev; } } while (disk != conf->last_used); -rb_out: + rb_out: if (new_disk >= 0) { - conf->next_seq_sect = this_sector + sectors; - conf->last_used = new_disk; - atomic_inc(&new_rdev->nr_pending); - if (!new_rdev->in_sync) { + rdev = conf->mirrors[new_disk].rdev; + if (!rdev) + goto retry; + atomic_inc(&rdev->nr_pending); + if (!rdev->in_sync) { /* cannot risk returning a device that failed * before we inc'ed nr_pending */ - atomic_dec(&new_rdev->nr_pending); + atomic_dec(&rdev->nr_pending); goto retry; } + conf->next_seq_sect = this_sector + sectors; + conf->last_used = new_disk; } rcu_read_unlock(); @@ 
-542,6 +601,39 @@ static void device_barrier(conf_t *conf, sector_t sect) spin_unlock_irq(&conf->resync_lock); } +/* duplicate the data pages for behind I/O */ +static struct page **alloc_behind_pages(struct bio *bio) +{ + int i; + struct bio_vec *bvec; + struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *), + GFP_NOIO); + if (unlikely(!pages)) + goto do_sync_io; + + memset(pages, 0, bio->bi_vcnt * sizeof(struct page *)); + + bio_for_each_segment(bvec, bio, i) { + pages[i] = alloc_page(GFP_NOIO); + if (unlikely(!pages[i])) + goto do_sync_io; + memcpy(kmap(pages[i]) + bvec->bv_offset, + kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); + kunmap(pages[i]); + kunmap(bvec->bv_page); + } + + return pages; + +do_sync_io: + if (pages) + for (i = 0; i < bio->bi_vcnt && pages[i]; i++) + __free_page(pages[i]); + kfree(pages); + PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); + return NULL; +} + static int make_request(request_queue_t *q, struct bio * bio) { mddev_t *mddev = q->queuedata; @@ -554,7 +646,12 @@ static int make_request(request_queue_t *q, struct bio * bio) struct bitmap *bitmap = mddev->bitmap; unsigned long flags; struct bio_list bl; + struct page **behind_pages = NULL; + if (unlikely(bio_barrier(bio))) { + bio_endio(bio, bio->bi_size, -EOPNOTSUPP); + return 0; + } /* * Register the new request and wait if the reconstruction @@ -589,8 +686,6 @@ static int make_request(request_queue_t *q, struct bio * bio) r1_bio->mddev = mddev; r1_bio->sector = bio->bi_sector; - r1_bio->state = 0; - if (bio_data_dir(bio) == READ) { /* * read balancing logic: @@ -651,13 +746,22 @@ static int make_request(request_queue_t *q, struct bio * bio) } rcu_read_unlock(); + BUG_ON(targets == 0); /* we never fail the last device */ + if (targets < conf->raid_disks) { /* array is degraded, we will not clear the bitmap * on I/O completion (see raid1_end_write_request) */ set_bit(R1BIO_Degraded, &r1_bio->state); } + /* do behind I/O ? */ + if (bitmap && + atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && + (behind_pages = alloc_behind_pages(bio)) != NULL) + set_bit(R1BIO_BehindIO, &r1_bio->state); + atomic_set(&r1_bio->remaining, 0); + atomic_set(&r1_bio->behind_remaining, 0); bio_list_init(&bl); for (i = 0; i < disks; i++) { @@ -674,12 +778,31 @@ static int make_request(request_queue_t *q, struct bio * bio) mbio->bi_rw = WRITE; mbio->bi_private = r1_bio; + if (behind_pages) { + struct bio_vec *bvec; + int j; + + /* Yes, I really want the '__' version so that + * we clear any unused pointer in the io_vec, rather + * than leave them unchanged. 
This is important + * because when we come to free the pages, we won't + * know the originial bi_idx, so we just free + * them all + */ + __bio_for_each_segment(bvec, mbio, j, 0) + bvec->bv_page = behind_pages[j]; + if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) + atomic_inc(&r1_bio->behind_remaining); + } + atomic_inc(&r1_bio->remaining); bio_list_add(&bl, mbio); } + kfree(behind_pages); /* the behind pages are attached to the bios now */ - bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors); + bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, + test_bit(R1BIO_BehindIO, &r1_bio->state)); spin_lock_irqsave(&conf->device_lock, flags); bio_list_merge(&conf->pending_bio_list, &bl); bio_list_init(&bl); @@ -1105,6 +1228,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i sector_t max_sector, nr_sectors; int disk; int i; + int wonly; int write_targets = 0; int sync_blocks; int still_degraded = 0; @@ -1160,14 +1284,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i */ disk = conf->last_used; /* make sure disk is operational */ - + wonly = disk; while (conf->mirrors[disk].rdev == NULL || - !conf->mirrors[disk].rdev->in_sync) { + !conf->mirrors[disk].rdev->in_sync || + test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags) + ) { + if (conf->mirrors[disk].rdev && + conf->mirrors[disk].rdev->in_sync) + wonly = disk; if (disk <= 0) disk = conf->raid_disks; disk--; - if (disk == conf->last_used) + if (disk == conf->last_used) { + disk = wonly; break; + } } conf->last_used = disk; atomic_inc(&conf->mirrors[disk].rdev->nr_pending); @@ -1439,6 +1570,17 @@ out: static int stop(mddev_t *mddev) { conf_t *conf = mddev_to_conf(mddev); + struct bitmap *bitmap = mddev->bitmap; + int behind_wait = 0; + + /* wait for behind writes to complete */ + while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { + behind_wait++; + printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ); /* wait a second */ + /* need to kick something here to make sure I/O goes? 
*/ + } md_unregister_thread(mddev->thread); mddev->thread = NULL; @@ -1561,6 +1703,35 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks) return 0; } +static void raid1_quiesce(mddev_t *mddev, int state) +{ + conf_t *conf = mddev_to_conf(mddev); + + switch(state) { + case 1: + spin_lock_irq(&conf->resync_lock); + conf->barrier++; + wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, + conf->resync_lock, raid1_unplug(mddev->queue)); + spin_unlock_irq(&conf->resync_lock); + break; + case 0: + spin_lock_irq(&conf->resync_lock); + conf->barrier--; + spin_unlock_irq(&conf->resync_lock); + wake_up(&conf->wait_resume); + wake_up(&conf->wait_idle); + break; + } + if (mddev->thread) { + if (mddev->bitmap) + mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; + else + mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + md_wakeup_thread(mddev->thread); + } +} + static mdk_personality_t raid1_personality = { @@ -1577,6 +1748,7 @@ static mdk_personality_t raid1_personality = .sync_request = sync_request, .resize = raid1_resize, .reshape = raid1_reshape, + .quiesce = raid1_quiesce, }; static int __init raid_init(void) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 62ebb1bc72b..5bd1e9ec899 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -538,7 +538,8 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) } - current_distance = abs(this_sector - conf->mirrors[disk].head_position); + current_distance = abs(r10_bio->devs[slot].addr - + conf->mirrors[disk].head_position); /* Find the disk whose head is closest */ @@ -668,6 +669,11 @@ static int make_request(request_queue_t *q, struct bio * bio) int i; int chunk_sects = conf->chunk_mask + 1; + if (unlikely(bio_barrier(bio))) { + bio_endio(bio, bio->bi_size, -EOPNOTSUPP); + return 0; + } + /* If this request crosses a chunk boundary, we need to * split it. This will only happen for 1 PAGE (or less) requests. 
*/ @@ -900,6 +906,27 @@ static void close_sync(conf_t *conf) conf->r10buf_pool = NULL; } +/* check if there are enough drives for + * every block to appear on atleast one + */ +static int enough(conf_t *conf) +{ + int first = 0; + + do { + int n = conf->copies; + int cnt = 0; + while (n--) { + if (conf->mirrors[first].rdev) + cnt++; + first = (first+1) % conf->raid_disks; + } + if (cnt == 0) + return 0; + } while (first != 0); + return 1; +} + static int raid10_spare_active(mddev_t *mddev) { int i; @@ -938,6 +965,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) * very different from resync */ return 0; + if (!enough(conf)) + return 0; for (mirror=0; mirror < mddev->raid_disks; mirror++) if ( !(p=conf->mirrors+mirror)->rdev) { @@ -1445,7 +1474,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i } } if (j == conf->copies) { - BUG(); + /* Cannot recover, so abort the recovery */ + put_buf(r10_bio); + r10_bio = rb2; + if (!test_and_set_bit(MD_RECOVERY_ERR, &mddev->recovery)) + printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n", + mdname(mddev)); + break; } } if (biolist == NULL) { @@ -1678,9 +1713,10 @@ static int run(mddev_t *mddev) init_waitqueue_head(&conf->wait_idle); init_waitqueue_head(&conf->wait_resume); - if (!conf->working_disks) { - printk(KERN_ERR "raid10: no operational mirrors for %s\n", - mdname(mddev)); + /* need to check that every block has at least one working mirror */ + if (!enough(conf)) { + printk(KERN_ERR "raid10: not enough operational mirrors for %s\n", + mdname(mddev)); goto out_free_conf; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 43f231a467d..4683ca24c04 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -24,6 +24,8 @@ #include <linux/bitops.h> #include <asm/atomic.h> +#include <linux/raid/bitmap.h> + /* * Stripe cache */ @@ -79,8 +81,13 @@ static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) if (test_bit(STRIPE_HANDLE, &sh->state)) { if (test_bit(STRIPE_DELAYED, &sh->state)) list_add_tail(&sh->lru, &conf->delayed_list); - else + else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + conf->seq_write == sh->bm_seq) + list_add_tail(&sh->lru, &conf->bitmap_list); + else { + clear_bit(STRIPE_BIT_DELAY, &sh->state); list_add_tail(&sh->lru, &conf->handle_list); + } md_wakeup_thread(conf->mddev->thread); } else { if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { @@ -244,6 +251,9 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector spin_lock_irq(&conf->device_lock); do { + wait_event_lock_irq(conf->wait_for_stripe, + conf->quiesce == 0, + conf->device_lock, /* nothing */); sh = __find_stripe(conf, sector); if (!sh) { if (!conf->inactive_blocked) @@ -803,6 +813,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in { struct bio **bip; raid5_conf_t *conf = sh->raid_conf; + int firstwrite=0; PRINTK("adding bh b#%llu to stripe s#%llu\n", (unsigned long long)bi->bi_sector, @@ -811,9 +822,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in spin_lock(&sh->lock); spin_lock_irq(&conf->device_lock); - if (forwrite) + if (forwrite) { bip = &sh->dev[dd_idx].towrite; - else + if (*bip == NULL && sh->dev[dd_idx].written == NULL) + firstwrite = 1; + } else bip = &sh->dev[dd_idx].toread; while (*bip && (*bip)->bi_sector < bi->bi_sector) { if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) @@ -836,6 +849,13 @@ static int add_stripe_bio(struct 
stripe_head *sh, struct bio *bi, int dd_idx, in (unsigned long long)bi->bi_sector, (unsigned long long)sh->sector, dd_idx); + if (conf->mddev->bitmap && firstwrite) { + sh->bm_seq = conf->seq_write; + bitmap_startwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0); + set_bit(STRIPE_BIT_DELAY, &sh->state); + } + if (forwrite) { /* check if page is covered */ sector_t sector = sh->dev[dd_idx].sector; @@ -958,12 +978,13 @@ static void handle_stripe(struct stripe_head *sh) * need to be failed */ if (failed > 1 && to_read+to_write+written) { - spin_lock_irq(&conf->device_lock); for (i=disks; i--; ) { + int bitmap_end = 0; + spin_lock_irq(&conf->device_lock); /* fail all writes first */ bi = sh->dev[i].towrite; sh->dev[i].towrite = NULL; - if (bi) to_write--; + if (bi) { to_write--; bitmap_end = 1; } if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); @@ -981,6 +1002,7 @@ static void handle_stripe(struct stripe_head *sh) /* and fail all 'written' */ bi = sh->dev[i].written; sh->dev[i].written = NULL; + if (bi) bitmap_end = 1; while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); @@ -1009,8 +1031,11 @@ static void handle_stripe(struct stripe_head *sh) bi = nextbi; } } + spin_unlock_irq(&conf->device_lock); + if (bitmap_end) + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0, 0); } - spin_unlock_irq(&conf->device_lock); } if (failed > 1 && syncing) { md_done_sync(conf->mddev, STRIPE_SECTORS,0); @@ -1038,6 +1063,7 @@ static void handle_stripe(struct stripe_head *sh) test_bit(R5_UPTODATE, &dev->flags) ) { /* We can return any write requests */ struct bio *wbi, *wbi2; + int bitmap_end = 0; PRINTK("Return write for disc %d\n", i); spin_lock_irq(&conf->device_lock); wbi = dev->written; @@ -1051,7 +1077,13 @@ static void handle_stripe(struct stripe_head *sh) } wbi = wbi2; } + if (dev->towrite == NULL) + bitmap_end = 1; spin_unlock_irq(&conf->device_lock); + if (bitmap_end) + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, + !test_bit(STRIPE_DEGRADED, &sh->state), 0); } } } @@ -1175,7 +1207,8 @@ static void handle_stripe(struct stripe_head *sh) } } /* now if nothing is locked, and if we have enough data, we can start a write request */ - if (locked == 0 && (rcw == 0 ||rmw == 0)) { + if (locked == 0 && (rcw == 0 ||rmw == 0) && + !test_bit(STRIPE_BIT_DELAY, &sh->state)) { PRINTK("Computing parity...\n"); compute_parity(sh, rcw==0 ? 
RECONSTRUCT_WRITE : READ_MODIFY_WRITE); /* now every locked buffer is ready to be written */ @@ -1231,6 +1264,7 @@ static void handle_stripe(struct stripe_head *sh) dev = &sh->dev[failed_num]; set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); + clear_bit(STRIPE_DEGRADED, &sh->state); locked++; set_bit(STRIPE_INSYNC, &sh->state); set_bit(R5_Syncio, &dev->flags); @@ -1298,6 +1332,8 @@ static void handle_stripe(struct stripe_head *sh) bi->bi_next = NULL; generic_make_request(bi); } else { + if (rw == 1) + set_bit(STRIPE_DEGRADED, &sh->state); PRINTK("skip op %ld on disc %d for sector %llu\n", bi->bi_rw, i, (unsigned long long)sh->sector); clear_bit(R5_LOCKED, &sh->dev[i].flags); @@ -1322,6 +1358,20 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf) } } +static inline void activate_bit_delay(raid5_conf_t *conf) +{ + /* device_lock is held */ + struct list_head head; + list_add(&head, &conf->bitmap_list); + list_del_init(&conf->bitmap_list); + while (!list_empty(&head)) { + struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); + list_del_init(&sh->lru); + atomic_inc(&sh->count); + __release_stripe(conf, sh); + } +} + static void unplug_slaves(mddev_t *mddev) { raid5_conf_t *conf = mddev_to_conf(mddev); @@ -1354,8 +1404,10 @@ static void raid5_unplug_device(request_queue_t *q) spin_lock_irqsave(&conf->device_lock, flags); - if (blk_remove_plug(q)) + if (blk_remove_plug(q)) { + conf->seq_flush++; raid5_activate_delayed(conf); + } md_wakeup_thread(mddev->thread); spin_unlock_irqrestore(&conf->device_lock, flags); @@ -1411,6 +1463,11 @@ static int make_request (request_queue_t *q, struct bio * bi) sector_t logical_sector, last_sector; struct stripe_head *sh; + if (unlikely(bio_barrier(bi))) { + bio_endio(bi, bi->bi_size, -EOPNOTSUPP); + return 0; + } + md_write_start(mddev, bi); if (bio_data_dir(bi)==WRITE) { @@ -1488,10 +1545,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i sector_t first_sector; int raid_disks = conf->raid_disks; int data_disks = raid_disks-1; + sector_t max_sector = mddev->size << 1; + int sync_blocks; - if (sector_nr >= mddev->size <<1) { + if (sector_nr >= max_sector) { /* just being told to finish up .. 
@@ -1488,10 +1545,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
         sector_t first_sector;
         int raid_disks = conf->raid_disks;
         int data_disks = raid_disks-1;
+        sector_t max_sector = mddev->size << 1;
+        int sync_blocks;

-        if (sector_nr >= mddev->size <<1) {
+        if (sector_nr >= max_sector) {
                 /* just being told to finish up .. nothing much to do */
                 unplug_slaves(mddev);
+
+                if (mddev->curr_resync < max_sector) /* aborted */
+                        bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
+                                        &sync_blocks, 1);
+                else /* completed sync */
+                        conf->fullsync = 0;
+                bitmap_close_sync(mddev->bitmap);
+
                 return 0;
         }
         /* if there is 1 or more failed drives and we are trying
@@ -1503,6 +1570,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                 *skipped = 1;
                 return rv;
         }
+        if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+            !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
+                /* we can skip this block, and probably more */
+                sync_blocks /= STRIPE_SECTORS;
+                *skipped = 1;
+                return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
+        }

         x = sector_nr;
         chunk_offset = sector_div(x, sectors_per_chunk);
@@ -1520,6 +1594,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                 set_current_state(TASK_UNINTERRUPTIBLE);
                 schedule_timeout(1);
         }
+        bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0);
         spin_lock(&sh->lock);
         set_bit(STRIPE_SYNCING, &sh->state);
         clear_bit(STRIPE_INSYNC, &sh->state);
@@ -1553,6 +1628,13 @@ static void raid5d (mddev_t *mddev)
         while (1) {
                 struct list_head *first;

+                if (conf->seq_flush - conf->seq_write > 0) {
+                        int seq = conf->seq_flush;
+                        bitmap_unplug(mddev->bitmap);
+                        conf->seq_write = seq;
+                        activate_bit_delay(conf);
+                }
+
                 if (list_empty(&conf->handle_list) &&
                     atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
                     !blk_queue_plugged(mddev->queue) &&
@@ -1586,7 +1668,7 @@ static void raid5d (mddev_t *mddev)
         PRINTK("--- raid5d inactive\n");
 }

-static int run (mddev_t *mddev)
+static int run(mddev_t *mddev)
 {
         raid5_conf_t *conf;
         int raid_disk, memory;
@@ -1616,6 +1698,7 @@ static int run (mddev_t *mddev)
         init_waitqueue_head(&conf->wait_for_overlap);
         INIT_LIST_HEAD(&conf->handle_list);
         INIT_LIST_HEAD(&conf->delayed_list);
+        INIT_LIST_HEAD(&conf->bitmap_list);
         INIT_LIST_HEAD(&conf->inactive_list);
         atomic_set(&conf->active_stripes, 0);
         atomic_set(&conf->preread_active_stripes, 0);
@@ -1727,6 +1810,9 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +

         /* Ok, everything is just fine now */
+        if (mddev->bitmap)
+                mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+
         mddev->queue->unplug_fn = raid5_unplug_device;
         mddev->queue->issue_flush_fn = raid5_issue_flush;
@@ -1907,6 +1993,8 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
                         rdev->in_sync = 0;
                         rdev->raid_disk = disk;
                         found = 1;
+                        if (rdev->saved_raid_disk != disk)
+                                conf->fullsync = 1;
                         p->rdev = rdev;
                         break;
                 }
@@ -1936,6 +2024,35 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
         return 0;
 }

+static void raid5_quiesce(mddev_t *mddev, int state)
+{
+        raid5_conf_t *conf = mddev_to_conf(mddev);
+
+        switch(state) {
+        case 1: /* stop all writes */
+                spin_lock_irq(&conf->device_lock);
+                conf->quiesce = 1;
+                wait_event_lock_irq(conf->wait_for_stripe,
+                                    atomic_read(&conf->active_stripes) == 0,
+                                    conf->device_lock, /* nothing */);
+                spin_unlock_irq(&conf->device_lock);
+                break;
+
+        case 0: /* re-enable writes */
+                spin_lock_irq(&conf->device_lock);
+                conf->quiesce = 0;
+                wake_up(&conf->wait_for_stripe);
+                spin_unlock_irq(&conf->device_lock);
+                break;
+        }
+        if (mddev->thread) {
+                if (mddev->bitmap)
+                        mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+                else
+                        mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+                md_wakeup_thread(mddev->thread);
+        }
+}
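
raid5_quiesce() and the wait added to get_active_stripe() form a two-sided handshake on the same waitqueue: quiesce(1) raises conf->quiesce and sleeps until every active stripe drains, while new callers of get_active_stripe() sleep until quiesce(0) drops the flag. The two waits, condensed and annotated (a sketch, not a verbatim quote of the code above):

    /* freezer side, raid5_quiesce(mddev, 1): wait_event_lock_irq() drops
     * device_lock while sleeping and retakes it before each re-test */
    conf->quiesce = 1;
    wait_event_lock_irq(conf->wait_for_stripe,
                        atomic_read(&conf->active_stripes) == 0,
                        conf->device_lock, /* nothing */);

    /* writer side, get_active_stripe(): blocks while the array is frozen */
    wait_event_lock_irq(conf->wait_for_stripe,
                        conf->quiesce == 0,
                        conf->device_lock, /* nothing */);

Both conditions share conf->wait_for_stripe, so the existing wake-ups for stripe release and the wake_up() in quiesce(0) serve either sleeper.
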

 static mdk_personality_t raid5_personality=
 {
         .name           = "raid5",
@@ -1950,6 +2067,7 @@ static mdk_personality_t raid5_personality=
         .spare_active   = raid5_spare_active,
         .sync_request   = sync_request,
         .resize         = raid5_resize,
+        .quiesce        = raid5_quiesce,
 };

 static int __init raid5_init (void)
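
raid6main.c below receives the same plumbing as raid5.c above. The ordering that makes the write-intent log safe is carried by two counters: a stripe stamped with bm_seq == seq_write parks on bitmap_list in __release_stripe(), unplugging bumps seq_flush, and the daemon thread pushes the bitmap to disk before releasing the parked stripes. The daemon's step, reduced to a sketch:

    /* in the raid5d()/raid6d() main loop (sketch) */
    if (conf->seq_flush - conf->seq_write > 0) {
            int seq = conf->seq_flush;

            bitmap_unplug(mddev->bitmap);   /* dirty bitmap pages reach disk... */
            conf->seq_write = seq;          /* ...before any stripe stamped earlier */
            activate_bit_delay(conf);       /* is allowed onto handle_list */
    }
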
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 495dee1d1e8..267eb1430c8 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -29,6 +29,8 @@
 #include <asm/atomic.h>
 #include "raid6.h"

+#include <linux/raid/bitmap.h>
+
 /*
  * Stripe cache
  */
@@ -98,8 +100,13 @@ static inline void __release_stripe(raid6_conf_t *conf, struct stripe_head *sh)
                 if (test_bit(STRIPE_HANDLE, &sh->state)) {
                         if (test_bit(STRIPE_DELAYED, &sh->state))
                                 list_add_tail(&sh->lru, &conf->delayed_list);
-                        else
+                        else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+                                 conf->seq_write == sh->bm_seq)
+                                list_add_tail(&sh->lru, &conf->bitmap_list);
+                        else {
+                                clear_bit(STRIPE_BIT_DELAY, &sh->state);
                                 list_add_tail(&sh->lru, &conf->handle_list);
+                        }
                         md_wakeup_thread(conf->mddev->thread);
                 } else {
                         if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
@@ -262,6 +269,9 @@ static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector
         spin_lock_irq(&conf->device_lock);

         do {
+                wait_event_lock_irq(conf->wait_for_stripe,
+                                    conf->quiesce == 0,
+                                    conf->device_lock, /* nothing */);
                 sh = __find_stripe(conf, sector);
                 if (!sh) {
                         if (!conf->inactive_blocked)
@@ -906,6 +916,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 {
         struct bio **bip;
         raid6_conf_t *conf = sh->raid_conf;
+        int firstwrite=0;

         PRINTK("adding bh b#%llu to stripe s#%llu\n",
                 (unsigned long long)bi->bi_sector,
@@ -914,9 +925,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
         spin_lock(&sh->lock);
         spin_lock_irq(&conf->device_lock);
-        if (forwrite)
+        if (forwrite) {
                 bip = &sh->dev[dd_idx].towrite;
-        else
+                if (*bip == NULL && sh->dev[dd_idx].written == NULL)
+                        firstwrite = 1;
+        } else
                 bip = &sh->dev[dd_idx].toread;
         while (*bip && (*bip)->bi_sector < bi->bi_sector) {
                 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
@@ -939,6 +952,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
                 (unsigned long long)bi->bi_sector,
                 (unsigned long long)sh->sector, dd_idx);

+        if (conf->mddev->bitmap && firstwrite) {
+                sh->bm_seq = conf->seq_write;
+                bitmap_startwrite(conf->mddev->bitmap, sh->sector,
+                                  STRIPE_SECTORS, 0);
+                set_bit(STRIPE_BIT_DELAY, &sh->state);
+        }
+
         if (forwrite) {
                 /* check if page is covered */
                 sector_t sector = sh->dev[dd_idx].sector;
@@ -1066,12 +1086,13 @@ static void handle_stripe(struct stripe_head *sh)
          * need to be failed
          */
         if (failed > 2 && to_read+to_write+written) {
-                spin_lock_irq(&conf->device_lock);
                 for (i=disks; i--; ) {
+                        int bitmap_end = 0;
+                        spin_lock_irq(&conf->device_lock);
                         /* fail all writes first */
                         bi = sh->dev[i].towrite;
                         sh->dev[i].towrite = NULL;
-                        if (bi) to_write--;
+                        if (bi) { to_write--; bitmap_end = 1; }

                         if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
                                 wake_up(&conf->wait_for_overlap);
@@ -1089,6 +1110,7 @@ static void handle_stripe(struct stripe_head *sh)
                         /* and fail all 'written' */
                         bi = sh->dev[i].written;
                         sh->dev[i].written = NULL;
+                        if (bi) bitmap_end = 1;
                         while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
                                 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
                                 clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1117,8 +1139,11 @@ static void handle_stripe(struct stripe_head *sh)
                                         bi = nextbi;
                                 }
                         }
+                        spin_unlock_irq(&conf->device_lock);
+                        if (bitmap_end)
+                                bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+                                                STRIPE_SECTORS, 0, 0);
                 }
-                spin_unlock_irq(&conf->device_lock);
         }
         if (failed > 2 && syncing) {
                 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
@@ -1155,6 +1180,7 @@ static void handle_stripe(struct stripe_head *sh)
                     if (!test_bit(R5_LOCKED, &dev->flags) &&
                         test_bit(R5_UPTODATE, &dev->flags) ) {
                         /* We can return any write requests */
+                        int bitmap_end = 0;
                         struct bio *wbi, *wbi2;
                         PRINTK("Return write for stripe %llu disc %d\n",
                                (unsigned long long)sh->sector, i);
@@ -1170,7 +1196,13 @@ static void handle_stripe(struct stripe_head *sh)
                                 }
                                 wbi = wbi2;
                         }
+                        if (dev->towrite == NULL)
+                                bitmap_end = 1;
                         spin_unlock_irq(&conf->device_lock);
+                        if (bitmap_end)
+                                bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+                                                STRIPE_SECTORS,
+                                                !test_bit(STRIPE_DEGRADED, &sh->state), 0);
                 }
         }
 }
@@ -1285,7 +1317,8 @@ static void handle_stripe(struct stripe_head *sh)
                 }
         }
         /* now if nothing is locked, and if we have enough data, we can start a write request */
-        if (locked == 0 && rcw == 0) {
+        if (locked == 0 && rcw == 0 &&
+            !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
                 if ( must_compute > 0 ) {
                         /* We have failed blocks and need to compute them */
                         switch ( failed ) {
@@ -1388,6 +1421,7 @@ static void handle_stripe(struct stripe_head *sh)
                         bdev = &sh->dev[failed_num[1]];
                         locked += !test_bit(R5_LOCKED, &bdev->flags);
                         set_bit(R5_LOCKED, &bdev->flags);
+                        clear_bit(STRIPE_DEGRADED, &sh->state);
                         set_bit(R5_Wantwrite, &bdev->flags);

                         set_bit(STRIPE_INSYNC, &sh->state);
@@ -1457,6 +1491,8 @@ static void handle_stripe(struct stripe_head *sh)
                         bi->bi_next = NULL;
                         generic_make_request(bi);
                 } else {
+                        if (rw == 1)
+                                set_bit(STRIPE_DEGRADED, &sh->state);
                         PRINTK("skip op %ld on disc %d for sector %llu\n",
                                 bi->bi_rw, i, (unsigned long long)sh->sector);
                         clear_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -1481,6 +1517,20 @@ static inline void raid6_activate_delayed(raid6_conf_t *conf)
         }
 }

+static inline void activate_bit_delay(raid6_conf_t *conf)
+{
+        /* device_lock is held */
+        struct list_head head;
+        list_add(&head, &conf->bitmap_list);
+        list_del_init(&conf->bitmap_list);
+        while (!list_empty(&head)) {
+                struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
+                list_del_init(&sh->lru);
+                atomic_inc(&sh->count);
+                __release_stripe(conf, sh);
+        }
+}
+
 static void unplug_slaves(mddev_t *mddev)
 {
         raid6_conf_t *conf = mddev_to_conf(mddev);
@@ -1513,8 +1563,10 @@ static void raid6_unplug_device(request_queue_t *q)

         spin_lock_irqsave(&conf->device_lock, flags);

-        if (blk_remove_plug(q))
+        if (blk_remove_plug(q)) {
+                conf->seq_flush++;
                 raid6_activate_delayed(conf);
+        }
         md_wakeup_thread(mddev->thread);

         spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -1570,6 +1622,11 @@ static int make_request (request_queue_t *q, struct bio * bi)
         sector_t logical_sector, last_sector;
         struct stripe_head *sh;

+        if (unlikely(bio_barrier(bi))) {
+                bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
+                return 0;
+        }
+
         md_write_start(mddev, bi);

         if (bio_data_dir(bi)==WRITE) {
@@ -1647,10 +1704,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
         sector_t first_sector;
         int raid_disks = conf->raid_disks;
         int data_disks = raid_disks - 2;
+        sector_t max_sector = mddev->size << 1;
+        int sync_blocks;

-        if (sector_nr >= mddev->size <<1) {
+        if (sector_nr >= max_sector) {
                 /* just being told to finish up .. nothing much to do */
                 unplug_slaves(mddev);
+
+                if (mddev->curr_resync < max_sector) /* aborted */
+                        bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
+                                        &sync_blocks, 1);
+                else /* completed sync */
+                        conf->fullsync = 0;
+                bitmap_close_sync(mddev->bitmap);
+
                 return 0;
         }
         /* if there are 2 or more failed drives and we are trying
@@ -1662,6 +1729,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                 *skipped = 1;
                 return rv;
         }
+        if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+            !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
+                /* we can skip this block, and probably more */
+                sync_blocks /= STRIPE_SECTORS;
+                *skipped = 1;
+                return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
+        }

         x = sector_nr;
         chunk_offset = sector_div(x, sectors_per_chunk);
@@ -1679,6 +1753,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                 set_current_state(TASK_UNINTERRUPTIBLE);
                 schedule_timeout(1);
         }
+        bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0);
         spin_lock(&sh->lock);
         set_bit(STRIPE_SYNCING, &sh->state);
         clear_bit(STRIPE_INSYNC, &sh->state);
@@ -1712,6 +1787,13 @@ static void raid6d (mddev_t *mddev)
         while (1) {
                 struct list_head *first;

+                if (conf->seq_flush - conf->seq_write > 0) {
+                        int seq = conf->seq_flush;
+                        bitmap_unplug(mddev->bitmap);
+                        conf->seq_write = seq;
+                        activate_bit_delay(conf);
+                }
+
                 if (list_empty(&conf->handle_list) &&
                     atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
                     !blk_queue_plugged(mddev->queue) &&
@@ -1745,7 +1827,7 @@ static void raid6d (mddev_t *mddev)
         PRINTK("--- raid6d inactive\n");
 }

-static int run (mddev_t *mddev)
+static int run(mddev_t *mddev)
 {
         raid6_conf_t *conf;
         int raid_disk, memory;
@@ -1775,6 +1857,7 @@ static int run (mddev_t *mddev)
         init_waitqueue_head(&conf->wait_for_overlap);
         INIT_LIST_HEAD(&conf->handle_list);
         INIT_LIST_HEAD(&conf->delayed_list);
+        INIT_LIST_HEAD(&conf->bitmap_list);
         INIT_LIST_HEAD(&conf->inactive_list);
         atomic_set(&conf->active_stripes, 0);
         atomic_set(&conf->preread_active_stripes, 0);
@@ -1894,6 +1977,9 @@ static int run (mddev_t *mddev)
         /* Ok, everything is just fine now */
         mddev->array_size =  mddev->size * (mddev->raid_disks - 2);
+        if (mddev->bitmap)
+                mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+
         mddev->queue->unplug_fn = raid6_unplug_device;
         mddev->queue->issue_flush_fn = raid6_issue_flush;
         return 0;
@@ -2071,6 +2157,8 @@ static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
                         rdev->in_sync = 0;
                         rdev->raid_disk = disk;
                         found = 1;
+                        if (rdev->saved_raid_disk != disk)
+                                conf->fullsync = 1;
                         p->rdev = rdev;
                         break;
                 }
@@ -2100,6 +2188,35 @@ static int raid6_resize(mddev_t *mddev, sector_t sectors)
         return 0;
 }

+static void raid6_quiesce(mddev_t *mddev, int state)
+{
+        raid6_conf_t *conf = mddev_to_conf(mddev);
+
+        switch(state) {
+        case 1: /* stop all writes */
+                spin_lock_irq(&conf->device_lock);
+                conf->quiesce = 1;
+                wait_event_lock_irq(conf->wait_for_stripe,
+                                    atomic_read(&conf->active_stripes) == 0,
+                                    conf->device_lock, /* nothing */);
+                spin_unlock_irq(&conf->device_lock);
+                break;
+
+        case 0: /* re-enable writes */
+                spin_lock_irq(&conf->device_lock);
+                conf->quiesce = 0;
+                wake_up(&conf->wait_for_stripe);
+                spin_unlock_irq(&conf->device_lock);
+                break;
+        }
+        if (mddev->thread) {
+                if (mddev->bitmap)
+                        mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+                else
+                        mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+                md_wakeup_thread(mddev->thread);
+        }
+}
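
Both sync_request() implementations share the skip fast path above: when bitmap_start_sync() reports that a run of sectors was never written since the last sync (and no full resync is forced), whole stripes are skipped without issuing any reads. The return value is rounded down so the resync cursor stays stripe-aligned; worked numbers (STRIPE_SECTORS is PAGE_SIZE>>9, i.e. 8 with 4 KiB pages; 1003 is an illustrative input, not a value from the patch):

    int sync_blocks = 1003;                      /* clean sectors reported by the bitmap */
    int stripes = sync_blocks / STRIPE_SECTORS;  /* 1003 / 8 == 125 */

    *skipped = 1;
    return stripes * STRIPE_SECTORS;             /* 1000, so the next call stays aligned */
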

 static mdk_personality_t raid6_personality=
 {
         .name           = "raid6",
@@ -2114,6 +2231,7 @@ static mdk_personality_t raid6_personality=
         .spare_active   = raid6_spare_active,
         .sync_request   = sync_request,
         .resize         = raid6_resize,
+        .quiesce        = raid6_quiesce,
 };

 static int __init raid6_init (void)
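
With .quiesce wired into both personality tables, md core can freeze a live array around operations that must not race with in-flight writes, for example while attaching a write-intent bitmap. The call pattern from the caller's side (illustrative sketch; not a quote of md.c):

    if (mddev->pers->quiesce) {
            mddev->pers->quiesce(mddev, 1); /* drain stripes, block new writers */
            /* ... reconfigure, e.g. attach the bitmap ... */
            mddev->pers->quiesce(mddev, 0); /* wake writers blocked above */
    }
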