diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bitmap.c | 39 | ||||
-rw-r--r-- | drivers/md/faulty.c | 2 | ||||
-rw-r--r-- | drivers/md/linear.c | 2 | ||||
-rw-r--r-- | drivers/md/md.c | 395 | ||||
-rw-r--r-- | drivers/md/mktables.c | 187 | ||||
-rw-r--r-- | drivers/md/multipath.c | 2 | ||||
-rw-r--r-- | drivers/md/raid0.c | 8 | ||||
-rw-r--r-- | drivers/md/raid1.c | 5 | ||||
-rw-r--r-- | drivers/md/raid10.c | 7 | ||||
-rw-r--r-- | drivers/md/raid5.c | 48 | ||||
-rw-r--r-- | drivers/md/raid6test/test.c | 117 |
11 files changed, 542 insertions, 270 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 1b1ef3130e6..a0585fb6da9 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -237,7 +237,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde if (!page) return ERR_PTR(-ENOMEM); - ITERATE_RDEV(mddev, rdev, tmp) { + rdev_for_each(rdev, tmp, mddev) { if (! test_bit(In_sync, &rdev->flags) || test_bit(Faulty, &rdev->flags)) continue; @@ -261,7 +261,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) struct list_head *tmp; mddev_t *mddev = bitmap->mddev; - ITERATE_RDEV(mddev, rdev, tmp) + rdev_for_each(rdev, tmp, mddev) if (test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) { int size = PAGE_SIZE; @@ -1348,14 +1348,38 @@ void bitmap_close_sync(struct bitmap *bitmap) */ sector_t sector = 0; int blocks; - if (!bitmap) return; + if (!bitmap) + return; while (sector < bitmap->mddev->resync_max_sectors) { bitmap_end_sync(bitmap, sector, &blocks, 0); -/* - if (sector < 500) printk("bitmap_close_sync: sec %llu blks %d\n", - (unsigned long long)sector, blocks); -*/ sector += blocks; + sector += blocks; + } +} + +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) +{ + sector_t s = 0; + int blocks; + + if (!bitmap) + return; + if (sector == 0) { + bitmap->last_end_sync = jiffies; + return; + } + if (time_before(jiffies, (bitmap->last_end_sync + + bitmap->daemon_sleep * HZ))) + return; + wait_event(bitmap->mddev->recovery_wait, + atomic_read(&bitmap->mddev->recovery_active) == 0); + + sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); + s = 0; + while (s < sector && s < bitmap->mddev->resync_max_sectors) { + bitmap_end_sync(bitmap, s, &blocks, 0); + s += blocks; } + bitmap->last_end_sync = jiffies; } static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) @@ -1565,3 +1589,4 @@ EXPORT_SYMBOL(bitmap_start_sync); EXPORT_SYMBOL(bitmap_end_sync); EXPORT_SYMBOL(bitmap_unplug); EXPORT_SYMBOL(bitmap_close_sync); +EXPORT_SYMBOL(bitmap_cond_end_sync); diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index cf2ddce3411..d107ddceefc 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -294,7 +294,7 @@ static int run(mddev_t *mddev) } conf->nfaults = 0; - ITERATE_RDEV(mddev, rdev, tmp) + rdev_for_each(rdev, tmp, mddev) conf->rdev = rdev; mddev->array_size = mddev->size; diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 3dac1cfb818..0b8511776b3 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -122,7 +122,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) cnt = 0; conf->array_size = 0; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { int j = rdev->raid_disk; dev_info_t *disk = conf->disks + j; diff --git a/drivers/md/md.c b/drivers/md/md.c index c28a120b416..5fc326d3970 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -195,7 +195,7 @@ static DEFINE_SPINLOCK(all_mddevs_lock); * Any code which breaks out of this loop while own * a reference to the current mddev and must mddev_put it. */ -#define ITERATE_MDDEV(mddev,tmp) \ +#define for_each_mddev(mddev,tmp) \ \ for (({ spin_lock(&all_mddevs_lock); \ tmp = all_mddevs.next; \ @@ -275,6 +275,7 @@ static mddev_t * mddev_find(dev_t unit) spin_lock_init(&new->write_lock); init_waitqueue_head(&new->sb_wait); new->reshape_position = MaxSector; + new->resync_max = MaxSector; new->queue = blk_alloc_queue(GFP_KERNEL); if (!new->queue) { @@ -310,7 +311,7 @@ static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) mdk_rdev_t * rdev; struct list_head *tmp; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (rdev->desc_nr == nr) return rdev; } @@ -322,7 +323,7 @@ static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) struct list_head *tmp; mdk_rdev_t *rdev; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (rdev->bdev->bd_dev == dev) return rdev; } @@ -773,12 +774,16 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) __u64 ev1 = md_event(sb); rdev->raid_disk = -1; - rdev->flags = 0; + clear_bit(Faulty, &rdev->flags); + clear_bit(In_sync, &rdev->flags); + clear_bit(WriteMostly, &rdev->flags); + clear_bit(BarriersNotsupp, &rdev->flags); + if (mddev->raid_disks == 0) { mddev->major_version = 0; mddev->minor_version = sb->minor_version; mddev->patch_version = sb->patch_version; - mddev->persistent = ! sb->not_persistent; + mddev->external = 0; mddev->chunk_size = sb->chunk_size; mddev->ctime = sb->ctime; mddev->utime = sb->utime; @@ -904,7 +909,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->size = mddev->size; sb->raid_disks = mddev->raid_disks; sb->md_minor = mddev->md_minor; - sb->not_persistent = !mddev->persistent; + sb->not_persistent = 0; sb->utime = mddev->utime; sb->state = 0; sb->events_hi = (mddev->events>>32); @@ -938,7 +943,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->state |= (1<<MD_SB_BITMAP_PRESENT); sb->disks[0].state = (1<<MD_DISK_REMOVED); - ITERATE_RDEV(mddev,rdev2,tmp) { + rdev_for_each(rdev2, tmp, mddev) { mdp_disk_t *d; int desc_nr; if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) @@ -1153,11 +1158,15 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) __u64 ev1 = le64_to_cpu(sb->events); rdev->raid_disk = -1; - rdev->flags = 0; + clear_bit(Faulty, &rdev->flags); + clear_bit(In_sync, &rdev->flags); + clear_bit(WriteMostly, &rdev->flags); + clear_bit(BarriersNotsupp, &rdev->flags); + if (mddev->raid_disks == 0) { mddev->major_version = 1; mddev->patch_version = 0; - mddev->persistent = 1; + mddev->external = 0; mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); @@ -1286,7 +1295,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) } max_dev = 0; - ITERATE_RDEV(mddev,rdev2,tmp) + rdev_for_each(rdev2, tmp, mddev) if (rdev2->desc_nr+1 > max_dev) max_dev = rdev2->desc_nr+1; @@ -1295,7 +1304,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) for (i=0; i<max_dev;i++) sb->dev_roles[i] = cpu_to_le16(0xfffe); - ITERATE_RDEV(mddev,rdev2,tmp) { + rdev_for_each(rdev2, tmp, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(0xfffe); @@ -1333,8 +1342,8 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) struct list_head *tmp, *tmp2; mdk_rdev_t *rdev, *rdev2; - ITERATE_RDEV(mddev1,rdev,tmp) - ITERATE_RDEV(mddev2, rdev2, tmp2) + rdev_for_each(rdev, tmp, mddev1) + rdev_for_each(rdev2, tmp2, mddev2) if (rdev->bdev->bd_contains == rdev2->bdev->bd_contains) return 1; @@ -1401,7 +1410,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) goto fail; } list_add(&rdev->same_set, &mddev->disks); - bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk); + bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); return 0; fail: @@ -1410,10 +1419,11 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) return err; } -static void delayed_delete(struct work_struct *ws) +static void md_delayed_delete(struct work_struct *ws) { mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work); kobject_del(&rdev->kobj); + kobject_put(&rdev->kobj); } static void unbind_rdev_from_array(mdk_rdev_t * rdev) @@ -1432,7 +1442,8 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) /* We need to delay this, otherwise we can deadlock when * writing to 'remove' to "dev/state" */ - INIT_WORK(&rdev->del_work, delayed_delete); + INIT_WORK(&rdev->del_work, md_delayed_delete); + kobject_get(&rdev->kobj); schedule_work(&rdev->del_work); } @@ -1441,7 +1452,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) * otherwise reused by a RAID array (or any other kernel * subsystem), by bd_claiming the device. */ -static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) +static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) { int err = 0; struct block_device *bdev; @@ -1453,13 +1464,15 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) __bdevname(dev, b)); return PTR_ERR(bdev); } - err = bd_claim(bdev, rdev); + err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev); if (err) { printk(KERN_ERR "md: could not bd_claim %s.\n", bdevname(bdev, b)); blkdev_put(bdev); return err; } + if (!shared) + set_bit(AllReserved, &rdev->flags); rdev->bdev = bdev; return err; } @@ -1503,7 +1516,7 @@ static void export_array(mddev_t *mddev) struct list_head *tmp; mdk_rdev_t *rdev; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (!rdev->mddev) { MD_BUG(); continue; @@ -1581,17 +1594,17 @@ static void md_print_devices(void) printk("md: **********************************\n"); printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); printk("md: **********************************\n"); - ITERATE_MDDEV(mddev,tmp) { + for_each_mddev(mddev, tmp) { if (mddev->bitmap) bitmap_print_sb(mddev->bitmap); else printk("%s: ", mdname(mddev)); - ITERATE_RDEV(mddev,rdev,tmp2) + rdev_for_each(rdev, tmp2, mddev) printk("<%s>", bdevname(rdev->bdev,b)); printk("\n"); - ITERATE_RDEV(mddev,rdev,tmp2) + rdev_for_each(rdev, tmp2, mddev) print_rdev(rdev); } printk("md: **********************************\n"); @@ -1610,7 +1623,7 @@ static void sync_sbs(mddev_t * mddev, int nospares) mdk_rdev_t *rdev; struct list_head *tmp; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (rdev->sb_events == mddev->events || (nospares && rdev->raid_disk < 0 && @@ -1696,18 +1709,20 @@ repeat: MD_BUG(); mddev->events --; } - sync_sbs(mddev, nospares); /* * do not write anything to disk if using * nonpersistent superblocks */ if (!mddev->persistent) { - clear_bit(MD_CHANGE_PENDING, &mddev->flags); + if (!mddev->external) + clear_bit(MD_CHANGE_PENDING, &mddev->flags); + spin_unlock_irq(&mddev->write_lock); wake_up(&mddev->sb_wait); return; } + sync_sbs(mddev, nospares); spin_unlock_irq(&mddev->write_lock); dprintk(KERN_INFO @@ -1715,7 +1730,7 @@ repeat: mdname(mddev),mddev->in_sync); bitmap_update_sb(mddev->bitmap); - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { char b[BDEVNAME_SIZE]; dprintk(KERN_INFO "md: "); if (rdev->sb_loaded != 1) @@ -1785,7 +1800,7 @@ static ssize_t state_show(mdk_rdev_t *rdev, char *page) { char *sep = ""; - int len=0; + size_t len = 0; if (test_bit(Faulty, &rdev->flags)) { len+= sprintf(page+len, "%sfaulty",sep); @@ -1887,20 +1902,45 @@ static ssize_t slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) { char *e; + int err; + char nm[20]; int slot = simple_strtoul(buf, &e, 10); if (strncmp(buf, "none", 4)==0) slot = -1; else if (e==buf || (*e && *e!= '\n')) return -EINVAL; - if (rdev->mddev->pers) - /* Cannot set slot in active array (yet) */ - return -EBUSY; - if (slot >= rdev->mddev->raid_disks) - return -ENOSPC; - rdev->raid_disk = slot; - /* assume it is working */ - rdev->flags = 0; - set_bit(In_sync, &rdev->flags); + if (rdev->mddev->pers) { + /* Setting 'slot' on an active array requires also + * updating the 'rd%d' link, and communicating + * with the personality with ->hot_*_disk. + * For now we only support removing + * failed/spare devices. This normally happens automatically, + * but not when the metadata is externally managed. + */ + if (slot != -1) + return -EBUSY; + if (rdev->raid_disk == -1) + return -EEXIST; + /* personality does all needed checks */ + if (rdev->mddev->pers->hot_add_disk == NULL) + return -EINVAL; + err = rdev->mddev->pers-> + hot_remove_disk(rdev->mddev, rdev->raid_disk); + if (err) + return err; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&rdev->mddev->kobj, nm); + set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); + md_wakeup_thread(rdev->mddev->thread); + } else { + if (slot >= rdev->mddev->raid_disks) + return -ENOSPC; + rdev->raid_disk = slot; + /* assume it is working */ + clear_bit(Faulty, &rdev->flags); + clear_bit(WriteMostly, &rdev->flags); + set_bit(In_sync, &rdev->flags); + } return len; } @@ -1923,6 +1963,10 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) return -EINVAL; if (rdev->mddev->pers) return -EBUSY; + if (rdev->size && rdev->mddev->external) + /* Must set offset before size, so overlap checks + * can be sane */ + return -EBUSY; rdev->data_offset = offset; return len; } @@ -1936,16 +1980,69 @@ rdev_size_show(mdk_rdev_t *rdev, char *page) return sprintf(page, "%llu\n", (unsigned long long)rdev->size); } +static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) +{ + /* check if two start/length pairs overlap */ + if (s1+l1 <= s2) + return 0; + if (s2+l2 <= s1) + return 0; + return 1; +} + static ssize_t rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) { char *e; unsigned long long size = simple_strtoull(buf, &e, 10); + unsigned long long oldsize = rdev->size; if (e==buf || (*e && *e != '\n')) return -EINVAL; if (rdev->mddev->pers) return -EBUSY; rdev->size = size; + if (size > oldsize && rdev->mddev->external) { + /* need to check that all other rdevs with the same ->bdev + * do not overlap. We need to unlock the mddev to avoid + * a deadlock. We have already changed rdev->size, and if + * we have to change it back, we will have the lock again. + */ + mddev_t *mddev; + int overlap = 0; + struct list_head *tmp, *tmp2; + + mddev_unlock(rdev->mddev); + for_each_mddev(mddev, tmp) { + mdk_rdev_t *rdev2; + + mddev_lock(mddev); + rdev_for_each(rdev2, tmp2, mddev) + if (test_bit(AllReserved, &rdev2->flags) || + (rdev->bdev == rdev2->bdev && + rdev != rdev2 && + overlaps(rdev->data_offset, rdev->size, + rdev2->data_offset, rdev2->size))) { + overlap = 1; + break; + } + mddev_unlock(mddev); + if (overlap) { + mddev_put(mddev); + break; + } + } + mddev_lock(rdev->mddev); + if (overlap) { + /* Someone else could have slipped in a size + * change here, but doing so is just silly. + * We put oldsize back because we *know* it is + * safe, and trust userspace not to race with + * itself + */ + rdev->size = oldsize; + return -EBUSY; + } + } if (size < rdev->mddev->size || rdev->mddev->size == 0) rdev->mddev->size = size; return len; @@ -1980,12 +2077,18 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); + int rv; if (!entry->store) return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - return entry->store(rdev, page, length); + rv = mddev_lock(rdev->mddev); + if (!rv) { + rv = entry->store(rdev, page, length); + mddev_unlock(rdev->mddev); + } + return rv; } static void rdev_free(struct kobject *ko) @@ -2029,7 +2132,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi if ((err = alloc_disk_sb(rdev))) goto abort_free; - err = lock_rdev(rdev, newdev); + err = lock_rdev(rdev, newdev, super_format == -2); if (err) goto abort_free; @@ -2099,7 +2202,7 @@ static void analyze_sbs(mddev_t * mddev) char b[BDEVNAME_SIZE]; freshest = NULL; - ITERATE_RDEV(mddev,rdev,tmp) + rdev_for_each(rdev, tmp, mddev) switch (super_types[mddev->major_version]. load_super(rdev, freshest, mddev->minor_version)) { case 1: @@ -2120,7 +2223,7 @@ static void analyze_sbs(mddev_t * mddev) validate_super(mddev, freshest); i = 0; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (rdev != freshest) if (super_types[mddev->major_version]. validate_super(mddev, rdev)) { @@ -2215,7 +2318,7 @@ level_show(mddev_t *mddev, char *page) static ssize_t level_store(mddev_t *mddev, const char *buf, size_t len) { - int rv = len; + ssize_t rv = len; if (mddev->pers) return -EBUSY; if (len == 0) @@ -2425,6 +2528,8 @@ array_state_show(mddev_t *mddev, char *page) case 0: if (mddev->in_sync) st = clean; + else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) + st = write_pending; else if (mddev->safemode) st = active_idle; else @@ -2455,11 +2560,9 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) break; case clear: /* stopping an active array */ - if (mddev->pers) { - if (atomic_read(&mddev->active) > 1) - return -EBUSY; - err = do_md_stop(mddev, 0); - } + if (atomic_read(&mddev->active) > 1) + return -EBUSY; + err = do_md_stop(mddev, 0); break; case inactive: /* stopping an active array */ @@ -2467,7 +2570,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) if (atomic_read(&mddev->active) > 1) return -EBUSY; err = do_md_stop(mddev, 2); - } + } else + err = 0; /* already inactive */ break; case suspended: break; /* not supported yet */ @@ -2495,9 +2599,15 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) restart_array(mddev); spin_lock_irq(&mddev->write_lock); if (atomic_read(&mddev->writes_pending) == 0) { - mddev->in_sync = 1; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); - } + if (mddev->in_sync == 0) { + mddev->in_sync = 1; + if (mddev->persistent) + set_bit(MD_CHANGE_CLEAN, + &mddev->flags); + } + err = 0; + } else + err = -EBUSY; spin_unlock_irq(&mddev->write_lock); } else { mddev->ro = 0; @@ -2508,7 +2618,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) case active: if (mddev->pers) { restart_array(mddev); - clear_bit(MD_CHANGE_CLEAN, &mddev->flags); + if (mddev->external) + clear_bit(MD_CHANGE_CLEAN, &mddev->flags); wake_up(&mddev->sb_wait); err = 0; } else { @@ -2574,7 +2685,9 @@ new_dev_store(mddev_t *mddev, const char *buf, size_t len) if (err < 0) goto out; } - } else + } else if (mddev->external) + rdev = md_import_device(dev, -2, -1); + else rdev = md_import_device(dev, -1, -1); if (IS_ERR(rdev)) @@ -2659,7 +2772,9 @@ __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); /* Metdata version. - * This is either 'none' for arrays with externally managed metadata, + * This is one of + * 'none' for arrays with no metadata (good luck...) + * 'external' for arrays with externally managed metadata, * or N.M for internally known formats */ static ssize_t @@ -2668,6 +2783,8 @@ metadata_show(mddev_t *mddev, char *page) if (mddev->persistent) return sprintf(page, "%d.%d\n", mddev->major_version, mddev->minor_version); + else if (mddev->external) + return sprintf(page, "external:%s\n", mddev->metadata_type); else return sprintf(page, "none\n"); } @@ -2682,6 +2799,21 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len) if (cmd_match(buf, "none")) { mddev->persistent = 0; + mddev->external = 0; + mddev->major_version = 0; + mddev->minor_version = 90; + return len; + } + if (strncmp(buf, "external:", 9) == 0) { + size_t namelen = len-9; + if (namelen >= sizeof(mddev->metadata_type)) + namelen = sizeof(mddev->metadata_type)-1; + strncpy(mddev->metadata_type, buf+9, namelen); + mddev->metadata_type[namelen] = 0; + if (namelen && mddev->metadata_type[namelen-1] == '\n') + mddev->metadata_type[--namelen] = 0; + mddev->persistent = 0; + mddev->external = 1; mddev->major_version = 0; mddev->minor_version = 90; return len; @@ -2698,6 +2830,7 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len) mddev->major_version = major; mddev->minor_version = minor; mddev->persistent = 1; + mddev->external = 0; return len; } @@ -2865,6 +2998,43 @@ sync_completed_show(mddev_t *mddev, char *page) static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); static ssize_t +max_sync_show(mddev_t *mddev, char *page) +{ + if (mddev->resync_max == MaxSector) + return sprintf(page, "max\n"); + else + return sprintf(page, "%llu\n", + (unsigned long long)mddev->resync_max); +} +static ssize_t +max_sync_store(mddev_t *mddev, const char *buf, size_t len) +{ + if (strncmp(buf, "max", 3) == 0) + mddev->resync_max = MaxSector; + else { + char *ep; + unsigned long long max = simple_strtoull(buf, &ep, 10); + if (ep == buf || (*ep != 0 && *ep != '\n')) + return -EINVAL; + if (max < mddev->resync_max && + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return -EBUSY; + + /* Must be a multiple of chunk_size */ + if (mddev->chunk_size) { + if (max & (sector_t)((mddev->chunk_size>>9)-1)) + return -EINVAL; + } + mddev->resync_max = max; + } + wake_up(&mddev->recovery_wait); + return len; +} + +static struct md_sysfs_entry md_max_sync = +__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); + +static ssize_t suspend_lo_show(mddev_t *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); @@ -2974,6 +3144,7 @@ static struct attribute *md_redundancy_attrs[] = { &md_sync_max.attr, &md_sync_speed.attr, &md_sync_completed.attr, + &md_max_sync.attr, &md_suspend_lo.attr, &md_suspend_hi.attr, &md_bitmap.attr, @@ -3118,8 +3289,11 @@ static int do_md_run(mddev_t * mddev) /* * Analyze all RAID superblock(s) */ - if (!mddev->raid_disks) + if (!mddev->raid_disks) { + if (!mddev->persistent) + return -EINVAL; analyze_sbs(mddev); + } chunk_size = mddev->chunk_size; @@ -3143,7 +3317,7 @@ static int do_md_run(mddev_t * mddev) } /* devices must have minimum size of one chunk */ - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (test_bit(Faulty, &rdev->flags)) continue; if (rdev->size < chunk_size / 1024) { @@ -3170,7 +3344,7 @@ static int do_md_run(mddev_t * mddev) * the only valid external interface is through the md * device. */ - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (test_bit(Faulty, &rdev->flags)) continue; sync_blockdev(rdev->bdev); @@ -3236,8 +3410,8 @@ static int do_md_run(mddev_t * mddev) mdk_rdev_t *rdev2; struct list_head *tmp2; int warned = 0; - ITERATE_RDEV(mddev, rdev, tmp) { - ITERATE_RDEV(mddev, rdev2, tmp2) { + rdev_for_each(rdev, tmp, mddev) { + rdev_for_each(rdev2, tmp2, mddev) { if (rdev < rdev2 && rdev->bdev->bd_contains == rdev2->bdev->bd_contains) { @@ -3297,7 +3471,7 @@ static int do_md_run(mddev_t * mddev) mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; - ITERATE_RDEV(mddev,rdev,tmp) + rdev_for_each(rdev, tmp, mddev) if (rdev->raid_disk >= 0) { char nm[20]; sprintf(nm, "rd%d", rdev->raid_disk); @@ -3330,7 +3504,7 @@ static int do_md_run(mddev_t * mddev) if (mddev->degraded && !mddev->sync_thread) { struct list_head *rtmp; int spares = 0; - ITERATE_RDEV(mddev,rdev,rtmp) + rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) @@ -3507,14 +3681,14 @@ static int do_md_stop(mddev_t * mddev, int mode) } mddev->bitmap_offset = 0; - ITERATE_RDEV(mddev,rdev,tmp) + rdev_for_each(rdev, tmp, mddev) if (rdev->raid_disk >= 0) { char nm[20]; sprintf(nm, "rd%d", rdev->raid_disk); sysfs_remove_link(&mddev->kobj, nm); } - /* make sure all delayed_delete calls have finished */ + /* make sure all md_delayed_delete calls have finished */ flush_scheduled_work(); export_array(mddev); @@ -3523,7 +3697,10 @@ static int do_md_stop(mddev_t * mddev, int mode) mddev->size = 0; mddev->raid_disks = 0; mddev->recovery_cp = 0; + mddev->resync_max = MaxSector; mddev->reshape_position = MaxSector; + mddev->external = 0; + mddev->persistent = 0; } else if (mddev->pers) printk(KERN_INFO "md: %s switched to read-only mode.\n", @@ -3546,7 +3723,7 @@ static void autorun_array(mddev_t *mddev) printk(KERN_INFO "md: running: "); - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { char b[BDEVNAME_SIZE]; printk("<%s>", bdevname(rdev->bdev,b)); } @@ -3589,7 +3766,7 @@ static void autorun_devices(int part) printk(KERN_INFO "md: considering %s ...\n", bdevname(rdev0->bdev,b)); INIT_LIST_HEAD(&candidates); - ITERATE_RDEV_PENDING(rdev,tmp) + rdev_for_each_list(rdev, tmp, pending_raid_disks) if (super_90_load(rdev, rdev0, 0) >= 0) { printk(KERN_INFO "md: adding %s ...\n", bdevname(rdev->bdev,b)); @@ -3632,7 +3809,8 @@ static void autorun_devices(int part) mddev_unlock(mddev); } else { printk(KERN_INFO "md: created %s\n", mdname(mddev)); - ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { + mddev->persistent = 1; + rdev_for_each_list(rdev, tmp, candidates) { list_del_init(&rdev->same_set); if (bind_rdev_to_array(rdev, mddev)) export_rdev(rdev); @@ -3643,7 +3821,7 @@ static void autorun_devices(int part) /* on success, candidates will be empty, on error * it won't... */ - ITERATE_RDEV_GENERIC(candidates,rdev,tmp) + rdev_for_each_list(rdev, tmp, candidates) export_rdev(rdev); mddev_put(mddev); } @@ -3673,7 +3851,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) struct list_head *tmp; nr=working=active=failed=spare=0; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { nr++; if (test_bit(Faulty, &rdev->flags)) failed++; @@ -3919,8 +4097,6 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) else rdev->raid_disk = -1; - rdev->flags = 0; - if (rdev->raid_disk < mddev->raid_disks) if (info->state & (1<<MD_DISK_SYNC)) set_bit(In_sync, &rdev->flags); @@ -4165,13 +4341,15 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) else mddev->recovery_cp = 0; mddev->persistent = ! info->not_persistent; + mddev->external = 0; mddev->layout = info->layout; mddev->chunk_size = info->chunk_size; mddev->max_disks = MD_SB_DISKS; - mddev->flags = 0; + if (mddev->persistent) + mddev->flags = 0; set_bit(MD_CHANGE_DEVS, &mddev->flags); mddev->default_bitmap_offset = MD_SB_BYTES >> 9; @@ -4213,7 +4391,7 @@ static int update_size(mddev_t *mddev, unsigned long size) */ if (mddev->sync_thread) return -EBUSY; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { sector_t avail; avail = rdev->size * 2; @@ -4471,9 +4649,10 @@ static int md_ioctl(struct inode *inode, struct file *file, */ /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ - if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY - && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE - && cmd != GET_BITMAP_FILE) { + if ((!mddev->raid_disks && !mddev->external) + && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY + && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE + && cmd != GET_BITMAP_FILE) { err = -ENODEV; goto abort_unlock; } @@ -4757,7 +4936,7 @@ static void status_unused(struct seq_file *seq) seq_printf(seq, "unused devices: "); - ITERATE_RDEV_PENDING(rdev,tmp) { + rdev_for_each_list(rdev, tmp, pending_raid_disks) { char b[BDEVNAME_SIZE]; i++; seq_printf(seq, "%s ", @@ -4953,7 +5132,7 @@ static int md_seq_show(struct seq_file *seq, void *v) } size = 0; - ITERATE_RDEV(mddev,rdev,tmp2) { + rdev_for_each(rdev, tmp2, mddev) { char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", bdevname(rdev->bdev,b), rdev->desc_nr); @@ -4982,7 +5161,10 @@ static int md_seq_show(struct seq_file *seq, void *v) mddev->major_version, mddev->minor_version); } - } else + } else if (mddev->external) + seq_printf(seq, " super external:%s", + mddev->metadata_type); + else seq_printf(seq, " super non-persistent"); if (mddev->pers) { @@ -5106,7 +5288,7 @@ static int is_mddev_idle(mddev_t *mddev) long curr_events; idle = 1; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; curr_events = disk_stat_read(disk, sectors[0]) + disk_stat_read(disk, sectors[1]) - @@ -5283,7 +5465,7 @@ void md_do_sync(mddev_t *mddev) set_bit(MD_RECOVERY_INTR, &mddev->recovery); goto skip; } - ITERATE_MDDEV(mddev2,tmp) { + for_each_mddev(mddev2, tmp) { if (mddev2 == mddev) continue; if (mddev2->curr_resync && @@ -5333,7 +5515,7 @@ void md_do_sync(mddev_t *mddev) /* recovery follows the physical size of devices */ max_sectors = mddev->size << 1; j = MaxSector; - ITERATE_RDEV(mddev,rdev,rtmp) + rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && @@ -5381,8 +5563,16 @@ void md_do_sync(mddev_t *mddev) sector_t sectors; skipped = 0; + if (j >= mddev->resync_max) { + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + wait_event(mddev->recovery_wait, + mddev->resync_max > j + || kthread_should_stop()); + } + if (kthread_should_stop()) + goto interrupted; sectors = mddev->pers->sync_request(mddev, j, &skipped, - currspeed < speed_min(mddev)); + currspeed < speed_min(mddev)); if (sectors == 0) { set_bit(MD_RECOVERY_ERR, &mddev->recovery); goto out; @@ -5424,15 +5614,9 @@ void md_do_sync(mddev_t *mddev) } - if (kthread_should_stop()) { - /* - * got a signal, exit. - */ - printk(KERN_INFO - "md: md_do_sync() got signal ... exiting\n"); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - goto out; - } + if (kthread_should_stop()) + goto interrupted; + /* * this loop exits only if either when we are slower than @@ -5484,7 +5668,7 @@ void md_do_sync(mddev_t *mddev) } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; - ITERATE_RDEV(mddev,rdev,rtmp) + rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && @@ -5496,9 +5680,22 @@ void md_do_sync(mddev_t *mddev) skip: mddev->curr_resync = 0; + mddev->resync_max = MaxSector; + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); wake_up(&resync_wait); set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); + return; + + interrupted: + /* + * got a signal, exit. + */ + printk(KERN_INFO + "md: md_do_sync() got signal ... exiting\n"); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + goto out; + } EXPORT_SYMBOL_GPL(md_do_sync); @@ -5509,8 +5706,9 @@ static int remove_and_add_spares(mddev_t *mddev) struct list_head *rtmp; int spares = 0; - ITERATE_RDEV(mddev,rdev,rtmp) + rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk >= 0 && + !mddev->external && (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) && atomic_read(&rdev->nr_pending)==0) { @@ -5524,7 +5722,7 @@ static int remove_and_add_spares(mddev_t *mddev) } if (mddev->degraded) { - ITERATE_RDEV(mddev,rdev,rtmp) + rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { rdev->recovery_offset = 0; @@ -5589,7 +5787,7 @@ void md_check_recovery(mddev_t *mddev) } if ( ! ( - mddev->flags || + (mddev->flags && !mddev->external) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || (mddev->safemode == 1) || @@ -5605,7 +5803,8 @@ void md_check_recovery(mddev_t *mddev) if (mddev->safemode && !atomic_read(&mddev->writes_pending) && !mddev->in_sync && mddev->recovery_cp == MaxSector) { mddev->in_sync = 1; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); + if (mddev->persistent) + set_bit(MD_CHANGE_CLEAN, &mddev->flags); } if (mddev->safemode == 1) mddev->safemode = 0; @@ -5637,7 +5836,7 @@ void md_check_recovery(mddev_t *mddev) * information must be scrapped */ if (!mddev->degraded) - ITERATE_RDEV(mddev,rdev,rtmp) + rdev_for_each(rdev, rtmp, mddev) rdev->saved_raid_disk = -1; mddev->recovery = 0; @@ -5714,7 +5913,7 @@ static int md_notify_reboot(struct notifier_block *this, printk(KERN_INFO "md: stopping all md devices.\n"); - ITERATE_MDDEV(mddev,tmp) + for_each_mddev(mddev, tmp) if (mddev_trylock(mddev)) { do_md_stop (mddev, 1); mddev_unlock(mddev); @@ -5848,7 +6047,7 @@ static __exit void md_exit(void) unregister_reboot_notifier(&md_notifier); unregister_sysctl_table(raid_table_header); remove_proc_entry("mdstat", NULL); - ITERATE_MDDEV(mddev,tmp) { + for_each_mddev(mddev, tmp) { struct gendisk *disk = mddev->gendisk; if (!disk) continue; diff --git a/drivers/md/mktables.c b/drivers/md/mktables.c index adef299908c..b61d5767aae 100644 --- a/drivers/md/mktables.c +++ b/drivers/md/mktables.c @@ -1,13 +1,10 @@ -#ident "$Id: mktables.c,v 1.2 2002/12/12 22:41:27 hpa Exp $" -/* ----------------------------------------------------------------------- * +/* -*- linux-c -*- ------------------------------------------------------- * * - * Copyright 2002 H. Peter Anvin - All Rights Reserved + * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. * * ----------------------------------------------------------------------- */ @@ -26,100 +23,98 @@ static uint8_t gfmul(uint8_t a, uint8_t b) { - uint8_t v = 0; - - while ( b ) { - if ( b & 1 ) v ^= a; - a = (a << 1) ^ (a & 0x80 ? 0x1d : 0); - b >>= 1; - } - return v; + uint8_t v = 0; + + while (b) { + if (b & 1) + v ^= a; + a = (a << 1) ^ (a & 0x80 ? 0x1d : 0); + b >>= 1; + } + + return v; } static uint8_t gfpow(uint8_t a, int b) { - uint8_t v = 1; - - b %= 255; - if ( b < 0 ) - b += 255; - - while ( b ) { - if ( b & 1 ) v = gfmul(v,a); - a = gfmul(a,a); - b >>= 1; - } - return v; + uint8_t v = 1; + + b %= 255; + if (b < 0) + b += 255; + + while (b) { + if (b & 1) + v = gfmul(v, a); + a = gfmul(a, a); + b >>= 1; + } + + return v; } int main(int argc, char *argv[]) { - int i, j, k; - uint8_t v; - uint8_t exptbl[256], invtbl[256]; - - printf("#include \"raid6.h\"\n"); - - /* Compute multiplication table */ - printf("\nconst u8 __attribute__((aligned(256)))\n" - "raid6_gfmul[256][256] =\n" - "{\n"); - for ( i = 0 ; i < 256 ; i++ ) { - printf("\t{\n"); - for ( j = 0 ; j < 256 ; j += 8 ) { - printf("\t\t"); - for ( k = 0 ; k < 8 ; k++ ) { - printf("0x%02x, ", gfmul(i,j+k)); - } - printf("\n"); - } - printf("\t},\n"); - } - printf("};\n"); - - /* Compute power-of-2 table (exponent) */ - v = 1; - printf("\nconst u8 __attribute__((aligned(256)))\n" - "raid6_gfexp[256] =\n" - "{\n"); - for ( i = 0 ; i < 256 ; i += 8 ) { - printf("\t"); - for ( j = 0 ; j < 8 ; j++ ) { - exptbl[i+j] = v; - printf("0x%02x, ", v); - v = gfmul(v,2); - if ( v == 1 ) v = 0; /* For entry 255, not a real entry */ - } - printf("\n"); - } - printf("};\n"); - - /* Compute inverse table x^-1 == x^254 */ - printf("\nconst u8 __attribute__((aligned(256)))\n" - "raid6_gfinv[256] =\n" - "{\n"); - for ( i = 0 ; i < 256 ; i += 8 ) { - printf("\t"); - for ( j = 0 ; j < 8 ; j++ ) { - invtbl[i+j] = v = gfpow(i+j,254); - printf("0x%02x, ", v); - } - printf("\n"); - } - printf("};\n"); - - /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ - printf("\nconst u8 __attribute__((aligned(256)))\n" - "raid6_gfexi[256] =\n" - "{\n"); - for ( i = 0 ; i < 256 ; i += 8 ) { - printf("\t"); - for ( j = 0 ; j < 8 ; j++ ) { - printf("0x%02x, ", invtbl[exptbl[i+j]^1]); - } - printf("\n"); - } - printf("};\n\n"); - - return 0; + int i, j, k; + uint8_t v; + uint8_t exptbl[256], invtbl[256]; + + printf("#include \"raid6.h\"\n"); + + /* Compute multiplication table */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfmul[256][256] =\n" + "{\n"); + for (i = 0; i < 256; i++) { + printf("\t{\n"); + for (j = 0; j < 256; j += 8) { + printf("\t\t"); + for (k = 0; k < 8; k++) + printf("0x%02x,%c", gfmul(i, j + k), + (k == 7) ? '\n' : ' '); + } + printf("\t},\n"); + } + printf("};\n"); + + /* Compute power-of-2 table (exponent) */ + v = 1; + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfexp[256] =\n" "{\n"); + for (i = 0; i < 256; i += 8) { + printf("\t"); + for (j = 0; j < 8; j++) { + exptbl[i + j] = v; + printf("0x%02x,%c", v, (j == 7) ? '\n' : ' '); + v = gfmul(v, 2); + if (v == 1) + v = 0; /* For entry 255, not a real entry */ + } + } + printf("};\n"); + + /* Compute inverse table x^-1 == x^254 */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfinv[256] =\n" "{\n"); + for (i = 0; i < 256; i += 8) { + printf("\t"); + for (j = 0; j < 8; j++) { + invtbl[i + j] = v = gfpow(i + j, 254); + printf("0x%02x,%c", v, (j == 7) ? '\n' : ' '); + } + } + printf("};\n"); + + /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfexi[256] =\n" "{\n"); + for (i = 0; i < 256; i += 8) { + printf("\t"); + for (j = 0; j < 8; j++) + printf("0x%02x,%c", invtbl[exptbl[i + j] ^ 1], + (j == 7) ? '\n' : ' '); + } + printf("};\n"); + + return 0; } diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index eb631ebed68..3f299d835a2 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -436,7 +436,7 @@ static int multipath_run (mddev_t *mddev) } conf->working_disks = 0; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { disk_idx = rdev->raid_disk; if (disk_idx < 0 || disk_idx >= mddev->raid_disks) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f8e591708d1..818b4828409 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -72,11 +72,11 @@ static int create_strip_zones (mddev_t *mddev) */ conf->nr_strip_zones = 0; - ITERATE_RDEV(mddev,rdev1,tmp1) { + rdev_for_each(rdev1, tmp1, mddev) { printk("raid0: looking at %s\n", bdevname(rdev1->bdev,b)); c = 0; - ITERATE_RDEV(mddev,rdev2,tmp2) { + rdev_for_each(rdev2, tmp2, mddev) { printk("raid0: comparing %s(%llu)", bdevname(rdev1->bdev,b), (unsigned long long)rdev1->size); @@ -124,7 +124,7 @@ static int create_strip_zones (mddev_t *mddev) cnt = 0; smallest = NULL; zone->dev = conf->devlist; - ITERATE_RDEV(mddev, rdev1, tmp1) { + rdev_for_each(rdev1, tmp1, mddev) { int j = rdev1->raid_disk; if (j < 0 || j >= mddev->raid_disks) { @@ -293,7 +293,7 @@ static int raid0_run (mddev_t *mddev) /* calculate array device size */ mddev->array_size = 0; - ITERATE_RDEV(mddev,rdev,tmp) + rdev_for_each(rdev, tmp, mddev) mddev->array_size += rdev->size; printk("raid0 : md_size is %llu blocks.\n", diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 4a69c416e04..5c7fef091ce 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1684,6 +1684,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (!go_faster && conf->nr_waiting) msleep_interruptible(1000); + bitmap_cond_end_sync(mddev->bitmap, sector_nr); raise_barrier(conf); conf->next_resync = sector_nr; @@ -1766,6 +1767,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return rv; } + if (max_sector > mddev->resync_max) + max_sector = mddev->resync_max; /* Don't do IO beyond here */ nr_sectors = 0; sync_blocks = 0; do { @@ -1884,7 +1887,7 @@ static int run(mddev_t *mddev) if (!conf->r1bio_pool) goto out_no_mem; - ITERATE_RDEV(mddev, rdev, tmp) { + rdev_for_each(rdev, tmp, mddev) { disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5cdcc938620..017f58113c3 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1657,6 +1657,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return (max_sector - sector_nr) + sectors_skipped; } + if (max_sector > mddev->resync_max) + max_sector = mddev->resync_max; /* Don't do IO beyond here */ + /* make sure whole request will fit in a chunk - if chunks * are meaningful */ @@ -1670,6 +1673,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (!go_faster && conf->nr_waiting) msleep_interruptible(1000); + bitmap_cond_end_sync(mddev->bitmap, sector_nr); + /* Again, very different code for resync and recovery. * Both must result in an r10bio with a list of bios that * have bi_end_io, bi_sector, bi_bdev set, @@ -2021,7 +2026,7 @@ static int run(mddev_t *mddev) goto out_free_conf; } - ITERATE_RDEV(mddev, rdev, tmp) { + rdev_for_each(rdev, tmp, mddev) { disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e8c8157b02f..2d6f1a51359 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3159,7 +3159,8 @@ static void raid5_activate_delayed(raid5_conf_t *conf) atomic_inc(&conf->preread_active_stripes); list_add_tail(&sh->lru, &conf->handle_list); } - } + } else + blk_plug_device(conf->mddev->queue); } static void activate_bit_delay(raid5_conf_t *conf) @@ -3549,7 +3550,8 @@ static int make_request(struct request_queue *q, struct bio * bi) goto retry; } finish_wait(&conf->wait_for_overlap, &w); - handle_stripe(sh, NULL); + set_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); release_stripe(sh); } else { /* cannot get stripe for read-ahead, just give-up */ @@ -3698,6 +3700,25 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped release_stripe(sh); first_sector += STRIPE_SECTORS; } + /* If this takes us to the resync_max point where we have to pause, + * then we need to write out the superblock. + */ + sector_nr += conf->chunk_size>>9; + if (sector_nr >= mddev->resync_max) { + /* Cannot proceed until we've updated the superblock... */ + wait_event(conf->wait_for_overlap, + atomic_read(&conf->reshape_stripes) == 0); + mddev->reshape_position = conf->expand_progress; + set_bit(MD_CHANGE_DEVS, &mddev->flags); + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_DEVS, &mddev->flags) + || kthread_should_stop()); + spin_lock_irq(&conf->device_lock); + conf->expand_lo = mddev->reshape_position; + spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_for_overlap); + } return conf->chunk_size>>9; } @@ -3734,6 +3755,12 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) return reshape_request(mddev, sector_nr, skipped); + /* No need to check resync_max as we never do more than one + * stripe, and as resync_max will always be on a chunk boundary, + * if the check in md_do_sync didn't fire, there is no chance + * of overstepping resync_max here + */ + /* if there is too many failed drives and we are trying * to resync, then assert that we are finished, because there is * nothing we can do. @@ -3753,6 +3780,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ } + + bitmap_cond_end_sync(mddev->bitmap, sector_nr); + pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks); sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1); if (sh == NULL) { @@ -3864,7 +3894,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) * During the scan, completed stripes are saved for us by the interrupt * handler, so that they will not have to wait for our next wakeup. */ -static void raid5d (mddev_t *mddev) +static void raid5d(mddev_t *mddev) { struct stripe_head *sh; raid5_conf_t *conf = mddev_to_conf(mddev); @@ -3889,12 +3919,6 @@ static void raid5d (mddev_t *mddev) activate_bit_delay(conf); } - if (list_empty(&conf->handle_list) && - atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && - !blk_queue_plugged(mddev->queue) && - !list_empty(&conf->delayed_list)) - raid5_activate_delayed(conf); - while ((bio = remove_bio_from_retry(conf))) { int ok; spin_unlock_irq(&conf->device_lock); @@ -4108,7 +4132,7 @@ static int run(mddev_t *mddev) pr_debug("raid5: run(%s) called.\n", mdname(mddev)); - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { raid_disk = rdev->raid_disk; if (raid_disk >= conf->raid_disks || raid_disk < 0) @@ -4521,7 +4545,7 @@ static int raid5_start_reshape(mddev_t *mddev) if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; - ITERATE_RDEV(mddev, rdev, rtmp) + rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) spares++; @@ -4543,7 +4567,7 @@ static int raid5_start_reshape(mddev_t *mddev) /* Add some new drives, as many as will fit. * We know there are enough to make the newly sized array work. */ - ITERATE_RDEV(mddev, rdev, rtmp) + rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { if (raid5_add_disk(mddev, rdev)) { diff --git a/drivers/md/raid6test/test.c b/drivers/md/raid6test/test.c index 0d5cd57accd..559cc41b258 100644 --- a/drivers/md/raid6test/test.c +++ b/drivers/md/raid6test/test.c @@ -1,12 +1,10 @@ /* -*- linux-c -*- ------------------------------------------------------- * * - * Copyright 2002 H. Peter Anvin - All Rights Reserved + * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. * * ----------------------------------------------------------------------- */ @@ -30,67 +28,87 @@ char *dataptrs[NDISKS]; char data[NDISKS][PAGE_SIZE]; char recovi[PAGE_SIZE], recovj[PAGE_SIZE]; -void makedata(void) +static void makedata(void) { int i, j; - for ( i = 0 ; i < NDISKS ; i++ ) { - for ( j = 0 ; j < PAGE_SIZE ; j++ ) { + for (i = 0; i < NDISKS; i++) { + for (j = 0; j < PAGE_SIZE; j++) data[i][j] = rand(); - } + dataptrs[i] = data[i]; } } +static char disk_type(int d) +{ + switch (d) { + case NDISKS-2: + return 'P'; + case NDISKS-1: + return 'Q'; + default: + return 'D'; + } +} + +static int test_disks(int i, int j) +{ + int erra, errb; + + memset(recovi, 0xf0, PAGE_SIZE); + memset(recovj, 0xba, PAGE_SIZE); + + dataptrs[i] = recovi; + dataptrs[j] = recovj; + + raid6_dual_recov(NDISKS, PAGE_SIZE, i, j, (void **)&dataptrs); + + erra = memcmp(data[i], recovi, PAGE_SIZE); + errb = memcmp(data[j], recovj, PAGE_SIZE); + + if (i < NDISKS-2 && j == NDISKS-1) { + /* We don't implement the DQ failure scenario, since it's + equivalent to a RAID-5 failure (XOR, then recompute Q) */ + erra = errb = 0; + } else { + printf("algo=%-8s faila=%3d(%c) failb=%3d(%c) %s\n", + raid6_call.name, + i, disk_type(i), + j, disk_type(j), + (!erra && !errb) ? "OK" : + !erra ? "ERRB" : + !errb ? "ERRA" : "ERRAB"); + } + + dataptrs[i] = data[i]; + dataptrs[j] = data[j]; + + return erra || errb; +} + int main(int argc, char *argv[]) { - const struct raid6_calls * const * algo; + const struct raid6_calls *const *algo; int i, j; - int erra, errb; + int err = 0; makedata(); - for ( algo = raid6_algos ; *algo ; algo++ ) { - if ( !(*algo)->valid || (*algo)->valid() ) { + for (algo = raid6_algos; *algo; algo++) { + if (!(*algo)->valid || (*algo)->valid()) { raid6_call = **algo; /* Nuke syndromes */ memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); /* Generate assumed good syndrome */ - raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, (void **)&dataptrs); - - for ( i = 0 ; i < NDISKS-1 ; i++ ) { - for ( j = i+1 ; j < NDISKS ; j++ ) { - memset(recovi, 0xf0, PAGE_SIZE); - memset(recovj, 0xba, PAGE_SIZE); - - dataptrs[i] = recovi; - dataptrs[j] = recovj; - - raid6_dual_recov(NDISKS, PAGE_SIZE, i, j, (void **)&dataptrs); - - erra = memcmp(data[i], recovi, PAGE_SIZE); - errb = memcmp(data[j], recovj, PAGE_SIZE); - - if ( i < NDISKS-2 && j == NDISKS-1 ) { - /* We don't implement the DQ failure scenario, since it's - equivalent to a RAID-5 failure (XOR, then recompute Q) */ - } else { - printf("algo=%-8s faila=%3d(%c) failb=%3d(%c) %s\n", - raid6_call.name, - i, (i==NDISKS-2)?'P':'D', - j, (j==NDISKS-1)?'Q':(j==NDISKS-2)?'P':'D', - (!erra && !errb) ? "OK" : - !erra ? "ERRB" : - !errb ? "ERRA" : - "ERRAB"); - } - - dataptrs[i] = data[i]; - dataptrs[j] = data[j]; - } - } + raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, + (void **)&dataptrs); + + for (i = 0; i < NDISKS-1; i++) + for (j = i+1; j < NDISKS; j++) + err += test_disks(i, j); } printf("\n"); } @@ -99,5 +117,8 @@ int main(int argc, char *argv[]) /* Pick the best algorithm test */ raid6_select_algo(); - return 0; + if (err) + printf("\n*** ERRORS FOUND ***\n"); + + return err; } |