Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bitmap.c              183
-rw-r--r--  drivers/md/dm-exception-store.c    9
-rw-r--r--  drivers/md/dm-raid1.c             12
-rw-r--r--  drivers/md/linear.c              100
-rw-r--r--  drivers/md/md.c                  227
-rw-r--r--  drivers/md/multipath.c             5
-rw-r--r--  drivers/md/raid0.c                 5
-rw-r--r--  drivers/md/raid1.c               234
-rw-r--r--  drivers/md/raid10.c               46
-rw-r--r--  drivers/md/raid5.c               138
-rw-r--r--  drivers/md/raid6main.c           138
11 files changed, 856 insertions, 241 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 41df4cda66e..2fba2bbe72d 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -270,19 +270,20 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
if (!page)
return ERR_PTR(-ENOMEM);
- do {
- ITERATE_RDEV(mddev, rdev, tmp)
- if (rdev->in_sync && !rdev->faulty)
- goto found;
- return ERR_PTR(-EIO);
- found:
+ ITERATE_RDEV(mddev, rdev, tmp) {
+ if (! rdev->in_sync || rdev->faulty)
+ continue;
+
target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512);
- } while (!sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ));
+ if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) {
+ page->index = index;
+ return page;
+ }
+ }
+ return ERR_PTR(-EIO);
- page->index = index;
- return page;
}
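
The rewritten loop above replaces the goto-based retry with a plain scan: try each in-sync, non-faulty device and return the first page that reads successfully. A minimal standalone sketch of that fallback-read pattern (illustrative types, not the kernel API):

#include <stddef.h>

struct dev { int in_sync; int faulty; };

/* return the index of the device that served the read, or -1 on failure */
static int read_from_any(struct dev *devs, size_t n,
			 int (*try_read)(struct dev *))
{
	for (size_t i = 0; i < n; i++) {
		if (!devs[i].in_sync || devs[i].faulty)
			continue;		/* skip unusable mirrors */
		if (try_read(&devs[i]))
			return (int)i;		/* first success wins */
	}
	return -1;				/* every candidate failed: -EIO */
}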
static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wait)
@@ -437,6 +438,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
printk(KERN_DEBUG " sync size: %llu KB\n",
(unsigned long long)le64_to_cpu(sb->sync_size)/2);
+ printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind));
kunmap(bitmap->sb_page);
}
@@ -445,7 +447,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
{
char *reason = NULL;
bitmap_super_t *sb;
- unsigned long chunksize, daemon_sleep;
+ unsigned long chunksize, daemon_sleep, write_behind;
unsigned long bytes_read;
unsigned long long events;
int err = -EINVAL;
@@ -474,6 +476,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
chunksize = le32_to_cpu(sb->chunksize);
daemon_sleep = le32_to_cpu(sb->daemon_sleep);
+ write_behind = le32_to_cpu(sb->write_behind);
/* verify that the bitmap-specific fields are valid */
if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
@@ -485,7 +488,9 @@ static int bitmap_read_sb(struct bitmap *bitmap)
else if ((1 << ffz(~chunksize)) != chunksize)
reason = "bitmap chunksize not a power of 2";
else if (daemon_sleep < 1 || daemon_sleep > 15)
- reason = "daemon sleep period out of range";
+ reason = "daemon sleep period out of range (1-15s)";
+ else if (write_behind > COUNTER_MAX)
+ reason = "write-behind limit out of range (0 - 16383)";
if (reason) {
printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n",
bmname(bitmap), reason);
@@ -518,8 +523,12 @@ success:
/* assign fields using values from superblock */
bitmap->chunksize = chunksize;
bitmap->daemon_sleep = daemon_sleep;
+ bitmap->daemon_lastrun = jiffies;
+ bitmap->max_write_behind = write_behind;
bitmap->flags |= sb->state;
bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
+ if (sb->state & BITMAP_STALE)
+ bitmap->events_cleared = bitmap->mddev->events;
err = 0;
out:
kunmap(bitmap->sb_page);
@@ -617,7 +626,7 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
page_cache_release(sb_page);
}
-static void bitmap_stop_daemons(struct bitmap *bitmap);
+static void bitmap_stop_daemon(struct bitmap *bitmap);
/* dequeue the next item in a page list -- don't call from irq context */
static struct page_list *dequeue_page(struct bitmap *bitmap)
@@ -659,7 +668,7 @@ static void bitmap_file_put(struct bitmap *bitmap)
bitmap->file = NULL;
spin_unlock_irqrestore(&bitmap->lock, flags);
- bitmap_stop_daemons(bitmap);
+ bitmap_stop_daemon(bitmap);
drain_write_queues(bitmap);
@@ -818,7 +827,7 @@ int bitmap_unplug(struct bitmap *bitmap)
return 0;
}
-static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset);
+static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed);
/* * bitmap_init_from_disk -- called at bitmap_create time to initialize
* the in-memory bitmap from the on-disk bitmap -- also, sets up the
* memory mapping of the bitmap file
@@ -826,8 +835,11 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset);
* if there's no bitmap file, or if the bitmap file had been
* previously kicked from the array, we mark all the bits as
* 1's in order to cause a full resync.
+ *
+ * We ignore all bits for sectors that end earlier than 'start'.
+ * This is used when reading an out-of-date bitmap...
*/
-static int bitmap_init_from_disk(struct bitmap *bitmap)
+static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
{
unsigned long i, chunks, index, oldindex, bit;
struct page *page = NULL, *oldpage = NULL;
@@ -914,7 +926,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap)
* whole page and write it out
*/
memset(page_address(page) + offset, 0xff,
- PAGE_SIZE - offset);
+ PAGE_SIZE - offset);
ret = write_page(bitmap, page, 1);
if (ret) {
kunmap(page);
@@ -928,8 +940,11 @@ static int bitmap_init_from_disk(struct bitmap *bitmap)
}
if (test_bit(bit, page_address(page))) {
/* if the disk bit is set, set the memory bit */
- bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap));
+ bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap),
+ ((i+1) << (CHUNK_BLOCK_SHIFT(bitmap)) >= start)
+ );
bit_cnt++;
+ set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
}
}
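
The 'needed' argument passed above is the result of an end-of-chunk test: a chunk keeps its resync bit only if it ends at or past 'start'. A small userspace sketch of that test, assuming an illustrative shift of 7 (128 sectors per chunk):

#include <stdio.h>

int main(void)
{
	unsigned shift = 7;			/* 128 sectors per bitmap chunk */
	unsigned long long start = 1000;	/* resume resync from here */
	unsigned long long i;

	for (i = 6; i <= 8; i++) {
		int needed = ((i + 1) << shift) >= start;
		printf("chunk %llu covers sectors %llu-%llu: needed=%d\n",
		       i, i << shift, ((i + 1) << shift) - 1, needed);
	}
	return 0;
}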
@@ -1141,6 +1156,9 @@ static void bitmap_writeback_daemon(mddev_t *mddev)
err = -EINTR;
goto out;
}
+ if (bitmap == NULL)
+ /* about to be stopped. */
+ return;
PRINTK("%s: bitmap writeback daemon woke up...\n", bmname(bitmap));
/* wait on bitmap page writebacks */
@@ -1170,21 +1188,12 @@ static void bitmap_writeback_daemon(mddev_t *mddev)
}
}
-static int bitmap_start_daemon(struct bitmap *bitmap, mdk_thread_t **ptr,
+static mdk_thread_t *bitmap_start_daemon(struct bitmap *bitmap,
void (*func)(mddev_t *), char *name)
{
mdk_thread_t *daemon;
- unsigned long flags;
char namebuf[32];
- spin_lock_irqsave(&bitmap->lock, flags);
- *ptr = NULL;
-
- if (!bitmap->file) /* no need for daemon if there's no backing file */
- goto out_unlock;
-
- spin_unlock_irqrestore(&bitmap->lock, flags);
-
#ifdef INJECT_FATAL_FAULT_2
daemon = NULL;
#else
@@ -1194,47 +1203,32 @@ static int bitmap_start_daemon(struct bitmap *bitmap, mdk_thread_t **ptr,
if (!daemon) {
printk(KERN_ERR "%s: failed to start bitmap daemon\n",
bmname(bitmap));
- return -ECHILD;
+ return ERR_PTR(-ECHILD);
}
- spin_lock_irqsave(&bitmap->lock, flags);
- *ptr = daemon;
-
md_wakeup_thread(daemon); /* start it running */
PRINTK("%s: %s daemon (pid %d) started...\n",
bmname(bitmap), name, daemon->tsk->pid);
-out_unlock:
- spin_unlock_irqrestore(&bitmap->lock, flags);
- return 0;
-}
-static int bitmap_start_daemons(struct bitmap *bitmap)
-{
- int err = bitmap_start_daemon(bitmap, &bitmap->writeback_daemon,
- bitmap_writeback_daemon, "bitmap_wb");
- return err;
+ return daemon;
}
-static void bitmap_stop_daemon(struct bitmap *bitmap, mdk_thread_t **ptr)
+static void bitmap_stop_daemon(struct bitmap *bitmap)
{
- mdk_thread_t *daemon;
- unsigned long flags;
-
- spin_lock_irqsave(&bitmap->lock, flags);
- daemon = *ptr;
- *ptr = NULL;
- spin_unlock_irqrestore(&bitmap->lock, flags);
- if (daemon)
- md_unregister_thread(daemon); /* destroy the thread */
-}
+ /* the daemon can't stop itself... it'll just exit instead... */
+ if (bitmap->writeback_daemon && ! IS_ERR(bitmap->writeback_daemon) &&
+ current->pid != bitmap->writeback_daemon->tsk->pid) {
+ mdk_thread_t *daemon;
+ unsigned long flags;
-static void bitmap_stop_daemons(struct bitmap *bitmap)
-{
- /* the daemons can't stop themselves... they'll just exit instead... */
- if (bitmap->writeback_daemon &&
- current->pid != bitmap->writeback_daemon->tsk->pid)
- bitmap_stop_daemon(bitmap, &bitmap->writeback_daemon);
+ spin_lock_irqsave(&bitmap->lock, flags);
+ daemon = bitmap->writeback_daemon;
+ bitmap->writeback_daemon = NULL;
+ spin_unlock_irqrestore(&bitmap->lock, flags);
+ if (daemon && ! IS_ERR(daemon))
+ md_unregister_thread(daemon); /* destroy the thread */
+ }
}
static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
@@ -1274,9 +1268,16 @@ static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
}
}
-int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors)
+int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind)
{
if (!bitmap) return 0;
+
+ if (behind) {
+ atomic_inc(&bitmap->behind_writes);
+ PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n",
+ atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
+ }
+
while (sectors) {
int blocks;
bitmap_counter_t *bmc;
@@ -1311,9 +1312,15 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
}
void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors,
- int success)
+ int success, int behind)
{
if (!bitmap) return;
+ if (behind) {
+ atomic_dec(&bitmap->behind_writes);
+ PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",
+ atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
+ }
+
while (sectors) {
int blocks;
unsigned long flags;
@@ -1424,7 +1431,7 @@ void bitmap_close_sync(struct bitmap *bitmap)
}
}
-static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset)
+static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
{
/* For each chunk covered by any of these sectors, set the
* counter to 1 and set resync_needed. They should all
@@ -1441,7 +1448,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset)
}
if (! *bmc) {
struct page *page;
- *bmc = 1 | NEEDED_MASK;
+ *bmc = 1 | (needed?NEEDED_MASK:0);
bitmap_count_page(bitmap, offset, 1);
page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
@@ -1476,17 +1483,14 @@ void bitmap_flush(mddev_t *mddev)
/*
* free memory that was allocated
*/
-void bitmap_destroy(mddev_t *mddev)
+static void bitmap_free(struct bitmap *bitmap)
{
unsigned long k, pages;
struct bitmap_page *bp;
- struct bitmap *bitmap = mddev->bitmap;
if (!bitmap) /* there was no bitmap */
return;
- mddev->bitmap = NULL; /* disconnect from the md device */
-
/* release the bitmap file and kill the daemon */
bitmap_file_put(bitmap);
@@ -1504,6 +1508,17 @@ void bitmap_destroy(mddev_t *mddev)
kfree(bp);
kfree(bitmap);
}
+void bitmap_destroy(mddev_t *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+
+ if (!bitmap) /* there was no bitmap */
+ return;
+
+ mddev->bitmap = NULL; /* disconnect from the md device */
+
+ bitmap_free(bitmap);
+}
/*
* initialize the bitmap structure
@@ -1517,6 +1532,7 @@ int bitmap_create(mddev_t *mddev)
unsigned long pages;
struct file *file = mddev->bitmap_file;
int err;
+ sector_t start;
BUG_ON(sizeof(bitmap_super_t) != 256);
@@ -1533,15 +1549,15 @@ int bitmap_create(mddev_t *mddev)
spin_lock_init(&bitmap->lock);
bitmap->mddev = mddev;
- mddev->bitmap = bitmap;
spin_lock_init(&bitmap->write_lock);
INIT_LIST_HEAD(&bitmap->complete_pages);
init_waitqueue_head(&bitmap->write_wait);
bitmap->write_pool = mempool_create(WRITE_POOL_SIZE, write_pool_alloc,
write_pool_free, NULL);
+ err = -ENOMEM;
if (!bitmap->write_pool)
- return -ENOMEM;
+ goto error;
bitmap->file = file;
bitmap->offset = mddev->bitmap_offset;
@@ -1549,7 +1565,7 @@ int bitmap_create(mddev_t *mddev)
/* read superblock from bitmap file (this sets bitmap->chunksize) */
err = bitmap_read_sb(bitmap);
if (err)
- return err;
+ goto error;
bitmap->chunkshift = find_first_bit(&bitmap->chunksize,
sizeof(bitmap->chunksize));
@@ -1573,27 +1589,44 @@ int bitmap_create(mddev_t *mddev)
#else
bitmap->bp = kmalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL);
#endif
+ err = -ENOMEM;
if (!bitmap->bp)
- return -ENOMEM;
+ goto error;
memset(bitmap->bp, 0, pages * sizeof(*bitmap->bp));
bitmap->flags |= BITMAP_ACTIVE;
/* now that we have some pages available, initialize the in-memory
* bitmap from the on-disk bitmap */
- err = bitmap_init_from_disk(bitmap);
+ start = 0;
+ if (mddev->degraded == 0
+ || bitmap->events_cleared == mddev->events)
+ /* no need to keep dirty bits to optimise a re-add of a missing device */
+ start = mddev->recovery_cp;
+ err = bitmap_init_from_disk(bitmap, start);
if (err)
- return err;
+ goto error;
printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
pages, bmname(bitmap));
- /* kick off the bitmap daemons */
- err = bitmap_start_daemons(bitmap);
- if (err)
- return err;
+ mddev->bitmap = bitmap;
+
+ if (file)
+ /* kick off the bitmap writeback daemon */
+ bitmap->writeback_daemon =
+ bitmap_start_daemon(bitmap,
+ bitmap_writeback_daemon,
+ "bitmap_wb");
+
+ if (IS_ERR(bitmap->writeback_daemon))
+ return PTR_ERR(bitmap->writeback_daemon);
return bitmap_update_sb(bitmap);
+
+ error:
+ bitmap_free(bitmap);
+ return err;
}
/* the bitmap API -- for raid personalities */
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 17212b4201a..cc07bbebbb1 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -568,12 +568,9 @@ int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
bad:
dm_io_put(sectors_to_pages(chunk_size));
- if (ps) {
- if (ps->area)
- free_area(ps);
-
- kfree(ps);
- }
+ if (ps && ps->area)
+ free_area(ps);
+ kfree(ps);
return r;
}
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index b08df8b9b2c..86328251375 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -375,16 +375,18 @@ static void rh_inc(struct region_hash *rh, region_t region)
read_lock(&rh->hash_lock);
reg = __rh_find(rh, region);
+
+ atomic_inc(&reg->pending);
+
+ spin_lock_irq(&rh->region_lock);
if (reg->state == RH_CLEAN) {
rh->log->type->mark_region(rh->log, reg->key);
- spin_lock_irq(&rh->region_lock);
reg->state = RH_DIRTY;
list_del_init(&reg->list); /* take off the clean list */
- spin_unlock_irq(&rh->region_lock);
}
+ spin_unlock_irq(&rh->region_lock);
- atomic_inc(&reg->pending);
read_unlock(&rh->hash_lock);
}
@@ -408,6 +410,10 @@ static void rh_dec(struct region_hash *rh, region_t region)
if (atomic_dec_and_test(&reg->pending)) {
spin_lock_irqsave(&rh->region_lock, flags);
+ if (atomic_read(&reg->pending)) { /* check race */
+ spin_unlock_irqrestore(&rh->region_lock, flags);
+ return;
+ }
if (reg->state == RH_RECOVERING) {
list_add_tail(&reg->list, &rh->quiesced_regions);
} else {
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 8d740013d74..bb279fad2fd 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -38,7 +38,8 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
/*
* sector_div(a,b) returns the remainder and sets a to a/b
*/
- (void)sector_div(block, conf->smallest->size);
+ block >>= conf->preshift;
+ (void)sector_div(block, conf->hash_spacing);
hash = conf->hash_table[block];
while ((sector>>1) >= (hash->size + hash->offset))
@@ -47,7 +48,7 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
}
/**
- * linear_mergeable_bvec -- tell bio layer if a two requests can be merged
+ * linear_mergeable_bvec -- tell bio layer if two requests can be merged
* @q: request queue
* @bio: the buffer head that's been built up so far
* @biovec: the request that could be merged to it.
@@ -116,7 +117,7 @@ static int linear_run (mddev_t *mddev)
dev_info_t **table;
mdk_rdev_t *rdev;
int i, nb_zone, cnt;
- sector_t start;
+ sector_t min_spacing;
sector_t curr_offset;
struct list_head *tmp;
@@ -127,11 +128,6 @@ static int linear_run (mddev_t *mddev)
memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t));
mddev->private = conf;
- /*
- * Find the smallest device.
- */
-
- conf->smallest = NULL;
cnt = 0;
mddev->array_size = 0;
@@ -159,8 +155,6 @@ static int linear_run (mddev_t *mddev)
disk->size = rdev->size;
mddev->array_size += rdev->size;
- if (!conf->smallest || (disk->size < conf->smallest->size))
- conf->smallest = disk;
cnt++;
}
if (cnt != mddev->raid_disks) {
@@ -168,6 +162,36 @@ static int linear_run (mddev_t *mddev)
goto out;
}
+ min_spacing = mddev->array_size;
+ sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *));
+
+ /* min_spacing is the minimum spacing that will fit the hash
+ * table in one PAGE. This may be much smaller than needed.
+ * We find the smallest non-terminal set of consecutive devices
+ * that is larger than min_spacing and use the size of that as
+ * the actual spacing
+ */
+ conf->hash_spacing = mddev->array_size;
+ for (i=0; i < cnt-1 ; i++) {
+ sector_t sz = 0;
+ int j;
+ for (j=i; j<cnt-1 && sz < min_spacing ; j++)
+ sz += conf->disks[j].size;
+ if (sz >= min_spacing && sz < conf->hash_spacing)
+ conf->hash_spacing = sz;
+ }
+
+ /* hash_spacing may be too large for sector_div to work with,
+ * so we might need to pre-shift
+ */
+ conf->preshift = 0;
+ if (sizeof(sector_t) > sizeof(u32)) {
+ sector_t space = conf->hash_spacing;
+ while (space > (sector_t)(~(u32)0)) {
+ space >>= 1;
+ conf->preshift++;
+ }
+ }
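
sector_div() can only divide by a 32-bit value, hence the pre-shift: both the dividend and hash_spacing are shifted right until the spacing fits in a u32. A userspace sketch of the idea, with simplified types and an arbitrary example spacing:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t hash_spacing = (uint64_t)6 << 32;	/* example spacing > 32 bits */
	unsigned preshift = 0;
	uint64_t sector, zone;

	while (hash_spacing > (uint64_t)UINT32_MAX) {	/* mirror the loop above */
		hash_spacing >>= 1;
		preshift++;
	}

	sector = 123456789012ULL;
	zone = (sector >> preshift) / (uint32_t)hash_spacing;
	printf("preshift=%u spacing=%llu zone=%llu\n", preshift,
	       (unsigned long long)hash_spacing, (unsigned long long)zone);
	return 0;
}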
/*
* This code was restructured to work around a gcc-2.95.3 internal
* compiler error. Alter it with care.
@@ -177,39 +201,52 @@ static int linear_run (mddev_t *mddev)
unsigned round;
unsigned long base;
- sz = mddev->array_size;
- base = conf->smallest->size;
+ sz = mddev->array_size >> conf->preshift;
+ sz += 1; /* force round-up */
+ base = conf->hash_spacing >> conf->preshift;
round = sector_div(sz, base);
- nb_zone = conf->nr_zones = sz + (round ? 1 : 0);
+ nb_zone = sz + (round ? 1 : 0);
}
-
- conf->hash_table = kmalloc (sizeof (dev_info_t*) * nb_zone,
+ BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *));
+
+ conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone,
GFP_KERNEL);
if (!conf->hash_table)
goto out;
/*
* Here we generate the linear hash table
+ * First calculate the device offsets.
*/
+ conf->disks[0].offset = 0;
+ for (i=1; i<mddev->raid_disks; i++)
+ conf->disks[i].offset =
+ conf->disks[i-1].offset +
+ conf->disks[i-1].size;
+
table = conf->hash_table;
- start = 0;
curr_offset = 0;
- for (i = 0; i < cnt; i++) {
- dev_info_t *disk = conf->disks + i;
+ i = 0;
+ for (curr_offset = 0;
+ curr_offset < mddev->array_size;
+ curr_offset += conf->hash_spacing) {
- disk->offset = curr_offset;
- curr_offset += disk->size;
+ while (i < mddev->raid_disks-1 &&
+ curr_offset >= conf->disks[i+1].offset)
+ i++;
- /* 'curr_offset' is the end of this disk
- * 'start' is the start of table
+ *table ++ = conf->disks + i;
+ }
+
+ if (conf->preshift) {
+ conf->hash_spacing >>= conf->preshift;
+ /* round hash_spacing up so that when we divide by it,
+ * we err on the side of "too-low", which is safest.
*/
- while (start < curr_offset) {
- *table++ = disk;
- start += conf->smallest->size;
- }
+ conf->hash_spacing++;
}
- if (table-conf->hash_table != nb_zone)
- BUG();
+
+ BUG_ON(table - conf->hash_table > nb_zone);
blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
mddev->queue->unplug_fn = linear_unplug;
@@ -238,6 +275,11 @@ static int linear_make_request (request_queue_t *q, struct bio *bio)
dev_info_t *tmp_dev;
sector_t block;
+ if (unlikely(bio_barrier(bio))) {
+ bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
+ return 0;
+ }
+
if (bio_data_dir(bio)==WRITE) {
disk_stat_inc(mddev->gendisk, writes);
disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio));
@@ -294,7 +336,7 @@ static void linear_status (struct seq_file *seq, mddev_t *mddev)
sector_t s = 0;
seq_printf(seq, " ");
- for (j = 0; j < conf->nr_zones; j++)
+ for (j = 0; j < mddev->raid_disks; j++)
{
char b[BDEVNAME_SIZE];
s += conf->smallest_size;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 20ca80b7dc2..2897df90df4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -34,6 +34,7 @@
#include <linux/module.h>
#include <linux/config.h>
+#include <linux/kthread.h>
#include <linux/linkage.h>
#include <linux/raid/md.h>
#include <linux/raid/bitmap.h>
@@ -73,7 +74,7 @@ static DEFINE_SPINLOCK(pers_lock);
* Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
* is 1000 KB/sec, so the extra system load does not show up that much.
* Increase it if you want to have more _guaranteed_ speed. Note that
- * the RAID driver will use the maximum available bandwith if the IO
+ * the RAID driver will use the maximum available bandwidth if the IO
* subsystem is idle. There is also an 'absolute maximum' reconstruction
* speed limit - in case reconstruction slows down your system despite
* idle IO detection.
@@ -393,7 +394,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size,
return ret;
}
-static int read_disk_sb(mdk_rdev_t * rdev)
+static int read_disk_sb(mdk_rdev_t * rdev, int size)
{
char b[BDEVNAME_SIZE];
if (!rdev->sb_page) {
@@ -404,7 +405,7 @@ static int read_disk_sb(mdk_rdev_t * rdev)
return 0;
- if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ))
+ if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
goto fail;
rdev->sb_loaded = 1;
return 0;
@@ -531,7 +532,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
sb_offset = calc_dev_sboffset(rdev->bdev);
rdev->sb_offset = sb_offset;
- ret = read_disk_sb(rdev);
+ ret = read_disk_sb(rdev, MD_SB_BYTES);
if (ret) return ret;
ret = -EINVAL;
@@ -564,6 +565,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
rdev->preferred_minor = sb->md_minor;
rdev->data_offset = 0;
+ rdev->sb_size = MD_SB_BYTES;
if (sb->level == LEVEL_MULTIPATH)
rdev->desc_nr = -1;
@@ -623,6 +625,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
mddev->size = sb->size;
mddev->events = md_event(sb);
mddev->bitmap_offset = 0;
+ mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
if (sb->state & (1<<MD_SB_CLEAN))
mddev->recovery_cp = MaxSector;
@@ -643,12 +646,12 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
mddev->bitmap_file == NULL) {
- if (mddev->level != 1) {
+ if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) {
/* FIXME use a better test */
printk(KERN_WARNING "md: bitmaps only support for raid1\n");
return -EINVAL;
}
- mddev->bitmap_offset = (MD_SB_BYTES >> 9);
+ mddev->bitmap_offset = mddev->default_bitmap_offset;
}
} else if (mddev->pers == NULL) {
@@ -669,6 +672,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
if (mddev->level != LEVEL_MULTIPATH) {
rdev->faulty = 0;
+ rdev->flags = 0;
desc = sb->disks + rdev->desc_nr;
if (desc->state & (1<<MD_DISK_FAULTY))
@@ -678,6 +682,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
rdev->in_sync = 1;
rdev->raid_disk = desc->raid_disk;
}
+ if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
+ set_bit(WriteMostly, &rdev->flags);
} else /* MULTIPATH are always insync */
rdev->in_sync = 1;
return 0;
@@ -706,6 +712,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
int i;
int active=0, working=0,failed=0,spare=0,nr_disks=0;
+ rdev->sb_size = MD_SB_BYTES;
+
sb = (mdp_super_t*)page_address(rdev->sb_page);
memset(sb, 0, sizeof(*sb));
@@ -776,6 +784,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
spare++;
working++;
}
+ if (test_bit(WriteMostly, &rdev2->flags))
+ d->state |= (1<<MD_DISK_WRITEMOSTLY);
}
/* now set the "removed" and "faulty" bits on any missing devices */
@@ -831,6 +841,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
int ret;
sector_t sb_offset;
char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
+ int bmask;
/*
* Calculate the position of the superblock.
@@ -859,7 +870,10 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
}
rdev->sb_offset = sb_offset;
- ret = read_disk_sb(rdev);
+ /* superblock is rarely larger than 1K, but it can be larger,
+ * and it is safe to read 4k, so we do that
+ */
+ ret = read_disk_sb(rdev, 4096);
if (ret) return ret;
@@ -869,7 +883,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
sb->major_version != cpu_to_le32(1) ||
le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
- sb->feature_map != 0)
+ (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
return -EINVAL;
if (calc_sb_1_csum(sb) != sb->sb_csum) {
@@ -885,6 +899,11 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
rdev->preferred_minor = 0xffff;
rdev->data_offset = le64_to_cpu(sb->data_offset);
+ rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
+ bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
+ if (rdev->sb_size & bmask)
+ rdev->sb_size = (rdev->sb_size | bmask)+1;
+
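The rounding above is the usual power-of-two idiom: with bmask = hardsect_size - 1, (size | bmask) + 1 is the next multiple of the sector size above a misaligned size. A standalone check of that idiom (a sketch, not the kernel code):

#include <assert.h>

static unsigned round_up_pow2(unsigned size, unsigned blocksize)
{
	unsigned bmask = blocksize - 1;	/* blocksize must be a power of two */

	if (size & bmask)		/* only round when misaligned */
		size = (size | bmask) + 1;
	return size;
}

int main(void)
{
	assert(round_up_pow2(1024 + 2 * 300, 512) == 2048);
	assert(round_up_pow2(1024, 512) == 1024);	/* already aligned */
	return 0;
}
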
if (refdev == 0)
return 1;
else {
@@ -939,13 +958,15 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
mddev->size = le64_to_cpu(sb->size)/2;
mddev->events = le64_to_cpu(sb->events);
mddev->bitmap_offset = 0;
+ mddev->default_bitmap_offset = 1024;
mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
memcpy(mddev->uuid, sb->set_uuid, 16);
mddev->max_disks = (4096-256)/2;
- if ((le32_to_cpu(sb->feature_map) & 1) &&
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
mddev->bitmap_file == NULL ) {
if (mddev->level != 1) {
printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
@@ -986,6 +1007,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
rdev->raid_disk = role;
break;
}
+ rdev->flags = 0;
+ if (sb->devflags & WriteMostly1)
+ set_bit(WriteMostly, &rdev->flags);
} else /* MULTIPATH are always insync */
rdev->in_sync = 1;
@@ -1017,7 +1041,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
if (mddev->bitmap && mddev->bitmap_file == NULL) {
sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
- sb->feature_map = cpu_to_le32(1);
+ sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
}
max_dev = 0;
@@ -1363,7 +1387,7 @@ repeat:
dprintk("%s ", bdevname(rdev->bdev,b));
if (!rdev->faulty) {
md_super_write(mddev,rdev,
- rdev->sb_offset<<1, MD_SB_BYTES,
+ rdev->sb_offset<<1, rdev->sb_size,
rdev->sb_page);
dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
bdevname(rdev->bdev,b),
@@ -2073,6 +2097,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
info.state = 0;
if (mddev->in_sync)
info.state = (1<<MD_SB_CLEAN);
+ if (mddev->bitmap && mddev->bitmap_offset)
+ info.state = (1<<MD_SB_BITMAP_PRESENT);
info.active_disks = active;
info.working_disks = working;
info.failed_disks = failed;
@@ -2087,7 +2113,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
return 0;
}
-static int get_bitmap_file(mddev_t * mddev, void * arg)
+static int get_bitmap_file(mddev_t * mddev, void __user * arg)
{
mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
char *ptr, *buf = NULL;
@@ -2146,6 +2172,8 @@ static int get_disk_info(mddev_t * mddev, void __user * arg)
info.state |= (1<<MD_DISK_ACTIVE);
info.state |= (1<<MD_DISK_SYNC);
}
+ if (test_bit(WriteMostly, &rdev->flags))
+ info.state |= (1<<MD_DISK_WRITEMOSTLY);
} else {
info.major = info.minor = 0;
info.raid_disk = -1;
@@ -2210,8 +2238,11 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
mdname(mddev));
return -EINVAL;
}
- rdev = md_import_device(dev, mddev->major_version,
- mddev->minor_version);
+ if (mddev->persistent)
+ rdev = md_import_device(dev, mddev->major_version,
+ mddev->minor_version);
+ else
+ rdev = md_import_device(dev, -1, -1);
if (IS_ERR(rdev)) {
printk(KERN_WARNING
"md: md_import_device returned %ld\n",
@@ -2231,6 +2262,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
rdev->saved_raid_disk = rdev->raid_disk;
rdev->in_sync = 0; /* just to be sure */
+ if (info->state & (1<<MD_DISK_WRITEMOSTLY))
+ set_bit(WriteMostly, &rdev->flags);
+
rdev->raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev);
if (err)
@@ -2271,6 +2305,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
else
rdev->in_sync = 0;
+ if (info->state & (1<<MD_DISK_WRITEMOSTLY))
+ set_bit(WriteMostly, &rdev->flags);
+
err = bind_rdev_to_array(rdev, mddev);
if (err) {
export_rdev(rdev);
@@ -2430,25 +2467,51 @@ static int set_bitmap_file(mddev_t *mddev, int fd)
{
int err;
- if (mddev->pers)
- return -EBUSY;
+ if (mddev->pers) {
+ if (!mddev->pers->quiesce)
+ return -EBUSY;
+ if (mddev->recovery || mddev->sync_thread)
+ return -EBUSY;
+ /* we should be able to change the bitmap.. */
+ }
- mddev->bitmap_file = fget(fd);
- if (mddev->bitmap_file == NULL) {
- printk(KERN_ERR "%s: error: failed to get bitmap file\n",
- mdname(mddev));
- return -EBADF;
- }
+ if (fd >= 0) {
+ if (mddev->bitmap)
+ return -EEXIST; /* cannot add when bitmap is present */
+ mddev->bitmap_file = fget(fd);
- err = deny_bitmap_write_access(mddev->bitmap_file);
- if (err) {
- printk(KERN_ERR "%s: error: bitmap file is already in use\n",
- mdname(mddev));
- fput(mddev->bitmap_file);
- mddev->bitmap_file = NULL;
- } else
+ if (mddev->bitmap_file == NULL) {
+ printk(KERN_ERR "%s: error: failed to get bitmap file\n",
+ mdname(mddev));
+ return -EBADF;
+ }
+
+ err = deny_bitmap_write_access(mddev->bitmap_file);
+ if (err) {
+ printk(KERN_ERR "%s: error: bitmap file is already in use\n",
+ mdname(mddev));
+ fput(mddev->bitmap_file);
+ mddev->bitmap_file = NULL;
+ return err;
+ }
mddev->bitmap_offset = 0; /* file overrides offset */
+ } else if (mddev->bitmap == NULL)
+ return -ENOENT; /* cannot remove what isn't there */
+ err = 0;
+ if (mddev->pers) {
+ mddev->pers->quiesce(mddev, 1);
+ if (fd >= 0)
+ err = bitmap_create(mddev);
+ if (fd < 0 || err)
+ bitmap_destroy(mddev);
+ mddev->pers->quiesce(mddev, 0);
+ } else if (fd < 0) {
+ if (mddev->bitmap_file)
+ fput(mddev->bitmap_file);
+ mddev->bitmap_file = NULL;
+ }
+
return err;
}
@@ -2528,6 +2591,11 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
{
int rv = 0;
int cnt = 0;
+ int state = 0;
+
+ /* calculate expected state, ignoring low bits */
+ if (mddev->bitmap && mddev->bitmap_offset)
+ state |= (1 << MD_SB_BITMAP_PRESENT);
if (mddev->major_version != info->major_version ||
mddev->minor_version != info->minor_version ||
@@ -2536,12 +2604,16 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
mddev->level != info->level ||
/* mddev->layout != info->layout || */
!mddev->persistent != info->not_persistent||
- mddev->chunk_size != info->chunk_size )
+ mddev->chunk_size != info->chunk_size ||
+ /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
+ ((state^info->state) & 0xfffffe00)
+ )
return -EINVAL;
/* Check there is only one change */
if (mddev->size != info->size) cnt++;
if (mddev->raid_disks != info->raid_disks) cnt++;
if (mddev->layout != info->layout) cnt++;
+ if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
if (cnt == 0) return 0;
if (cnt > 1) return -EINVAL;
@@ -2620,6 +2692,35 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
}
}
}
+ if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
+ if (mddev->pers->quiesce == NULL)
+ return -EINVAL;
+ if (mddev->recovery || mddev->sync_thread)
+ return -EBUSY;
+ if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
+ /* add the bitmap */
+ if (mddev->bitmap)
+ return -EEXIST;
+ if (mddev->default_bitmap_offset == 0)
+ return -EINVAL;
+ mddev->bitmap_offset = mddev->default_bitmap_offset;
+ mddev->pers->quiesce(mddev, 1);
+ rv = bitmap_create(mddev);
+ if (rv)
+ bitmap_destroy(mddev);
+ mddev->pers->quiesce(mddev, 0);
+ } else {
+ /* remove the bitmap */
+ if (!mddev->bitmap)
+ return -ENOENT;
+ if (mddev->bitmap->file)
+ return -EINVAL;
+ mddev->pers->quiesce(mddev, 1);
+ bitmap_destroy(mddev);
+ mddev->pers->quiesce(mddev, 0);
+ mddev->bitmap_offset = 0;
+ }
+ }
md_update_sb(mddev);
return rv;
}
@@ -2781,7 +2882,7 @@ static int md_ioctl(struct inode *inode, struct file *file,
goto done_unlock;
case GET_BITMAP_FILE:
- err = get_bitmap_file(mddev, (void *)arg);
+ err = get_bitmap_file(mddev, argp);
goto done_unlock;
case GET_DISK_INFO:
@@ -2950,18 +3051,6 @@ static int md_thread(void * arg)
{
mdk_thread_t *thread = arg;
- lock_kernel();
-
- /*
- * Detach thread
- */
-
- daemonize(thread->name, mdname(thread->mddev));
-
- current->exit_signal = SIGCHLD;
- allow_signal(SIGKILL);
- thread->tsk = current;
-
/*
* md_thread is a 'system-thread', it's priority should be very
* high. We avoid resource deadlocks individually in each
@@ -2973,14 +3062,14 @@ static int md_thread(void * arg)
* bdflush, otherwise bdflush will deadlock if there are too
* many dirty RAID5 blocks.
*/
- unlock_kernel();
complete(thread->event);
- while (thread->run) {
+ while (!kthread_should_stop()) {
void (*run)(mddev_t *);
wait_event_interruptible_timeout(thread->wqueue,
- test_bit(THREAD_WAKEUP, &thread->flags),
+ test_bit(THREAD_WAKEUP, &thread->flags)
+ || kthread_should_stop(),
thread->timeout);
try_to_freeze();
@@ -2989,11 +3078,8 @@ static int md_thread(void * arg)
run = thread->run;
if (run)
run(thread->mddev);
-
- if (signal_pending(current))
- flush_signals(current);
}
- complete(thread->event);
+
return 0;
}
@@ -3010,11 +3096,9 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
const char *name)
{
mdk_thread_t *thread;
- int ret;
struct completion event;
- thread = (mdk_thread_t *) kmalloc
- (sizeof(mdk_thread_t), GFP_KERNEL);
+ thread = kmalloc(sizeof(mdk_thread_t), GFP_KERNEL);
if (!thread)
return NULL;
@@ -3027,8 +3111,8 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
thread->mddev = mddev;
thread->name = name;
thread->timeout = MAX_SCHEDULE_TIMEOUT;
- ret = kernel_thread(md_thread, thread, 0);
- if (ret < 0) {
+ thread->tsk = kthread_run(md_thread, thread, mdname(thread->mddev));
+ if (IS_ERR(thread->tsk)) {
kfree(thread);
return NULL;
}
@@ -3038,21 +3122,9 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
void md_unregister_thread(mdk_thread_t *thread)
{
- struct completion event;
-
- init_completion(&event);
-
- thread->event = &event;
-
- /* As soon as ->run is set to NULL, the task could disappear,
- * so we need to hold tasklist_lock until we have sent the signal
- */
dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
- read_lock(&tasklist_lock);
- thread->run = NULL;
- send_sig(SIGKILL, thread->tsk, 1);
- read_unlock(&tasklist_lock);
- wait_for_completion(&event);
+
+ kthread_stop(thread->tsk);
kfree(thread);
}
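
The conversion above drops the old daemonize()/signal machinery in favour of the kthread API: kthread_run() creates and wakes the task, the worker polls kthread_should_stop(), and kthread_stop() wakes it and waits for it to return. A minimal module-style sketch of that life cycle (illustrative, not the md code itself):

#include <linux/kthread.h>
#include <linux/err.h>
#include <linux/delay.h>

static struct task_struct *worker;

static int worker_fn(void *data)
{
	while (!kthread_should_stop()) {
		/* ... do one unit of work, then sleep ... */
		msleep_interruptible(1000);
	}
	return 0;	/* returning ends the thread; kthread_stop() reaps it */
}

static int start_worker(void)
{
	worker = kthread_run(worker_fn, NULL, "example_worker");
	return IS_ERR(worker) ? (int)PTR_ERR(worker) : 0;
}

static void stop_worker(void)
{
	kthread_stop(worker);	/* wakes worker_fn and waits for its return */
}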
@@ -3259,10 +3331,13 @@ static int md_seq_show(struct seq_file *seq, void *v)
char b[BDEVNAME_SIZE];
seq_printf(seq, " %s[%d]",
bdevname(rdev->bdev,b), rdev->desc_nr);
+ if (test_bit(WriteMostly, &rdev->flags))
+ seq_printf(seq, "(W)");
if (rdev->faulty) {
seq_printf(seq, "(F)");
continue;
- }
+ } else if (rdev->raid_disk < 0)
+ seq_printf(seq, "(S)"); /* spare */
size += rdev->size;
}
@@ -3274,6 +3349,15 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "\n %llu blocks",
(unsigned long long)size);
}
+ if (mddev->persistent) {
+ if (mddev->major_version != 0 ||
+ mddev->minor_version != 90) {
+ seq_printf(seq," super %d.%d",
+ mddev->major_version,
+ mddev->minor_version);
+ }
+ } else
+ seq_printf(seq, " super non-persistent");
if (mddev->pers) {
mddev->pers->status (seq, mddev);
@@ -3416,7 +3500,6 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
*/
void md_write_start(mddev_t *mddev, struct bio *bi)
{
- DEFINE_WAIT(w);
if (bio_data_dir(bi) != WRITE)
return;
@@ -3533,7 +3616,7 @@ static void md_do_sync(mddev_t *mddev)
printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
" %d KB/sec/disc.\n", sysctl_speed_limit_min);
- printk(KERN_INFO "md: using maximum available idle IO bandwith "
+ printk(KERN_INFO "md: using maximum available idle IO bandwidth "
"(but not more than %d KB/sec) for reconstruction.\n",
sysctl_speed_limit_max);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 2d2ca7fa026..286342375fb 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -169,6 +169,11 @@ static int multipath_make_request (request_queue_t *q, struct bio * bio)
struct multipath_bh * mp_bh;
struct multipath_info *multipath;
+ if (unlikely(bio_barrier(bio))) {
+ bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
+ return 0;
+ }
+
mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
mp_bh->master_bio = bio;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 2120710172c..f6757259ce7 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -404,6 +404,11 @@ static int raid0_make_request (request_queue_t *q, struct bio *bio)
unsigned long chunk;
sector_t block, rsect;
+ if (unlikely(bio_barrier(bio))) {
+ bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
+ return 0;
+ }
+
if (bio_data_dir(bio)==WRITE) {
disk_stat_inc(mddev->gendisk, writes);
disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio));
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 51d9645ed09..a93ca478142 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -222,8 +222,17 @@ static void raid_end_bio_io(r1bio_t *r1_bio)
{
struct bio *bio = r1_bio->master_bio;
- bio_endio(bio, bio->bi_size,
- test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
+ /* if nobody has done the final endio yet, do it now */
+ if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+ PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
+ (bio_data_dir(bio) == WRITE) ? "write" : "read",
+ (unsigned long long) bio->bi_sector,
+ (unsigned long long) bio->bi_sector +
+ (bio->bi_size >> 9) - 1);
+
+ bio_endio(bio, bio->bi_size,
+ test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
+ }
free_r1bio(r1_bio);
}
@@ -292,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
- int mirror;
+ int mirror, behind;
conf_t *conf = mddev_to_conf(r1_bio->mddev);
if (bio->bi_size)
@@ -323,16 +332,46 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
update_head_pos(mirror, r1_bio);
+ behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
+ if (behind) {
+ if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
+ atomic_dec(&r1_bio->behind_remaining);
+
+ /* In behind mode, we ACK the master bio once the I/O has safely
+ * reached all non-writemostly disks. Setting the Returned bit
+ * ensures that this gets done only once -- we don't ever want to
+ * return -EIO here, instead we'll wait */
+
+ if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
+ test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+ /* Maybe we can return now */
+ if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+ struct bio *mbio = r1_bio->master_bio;
+ PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
+ (unsigned long long) mbio->bi_sector,
+ (unsigned long long) mbio->bi_sector +
+ (mbio->bi_size >> 9) - 1);
+ bio_endio(mbio, mbio->bi_size, 0);
+ }
+ }
+ }
/*
*
* Let's see if all mirrored write operations have finished
* already.
*/
if (atomic_dec_and_test(&r1_bio->remaining)) {
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+ /* free extra copy of the data pages */
+ int i = bio->bi_vcnt;
+ while (i--)
+ __free_page(bio->bi_io_vec[i].bv_page);
+ }
/* clear the bitmap if all writes complete successfully */
bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
r1_bio->sectors,
- !test_bit(R1BIO_Degraded, &r1_bio->state));
+ !test_bit(R1BIO_Degraded, &r1_bio->state),
+ behind);
md_write_end(r1_bio->mddev);
raid_end_bio_io(r1_bio);
}
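
The early-ACK test above fires when every write still outstanding is to a write-mostly device: behind_remaining must cover all pending writes except the completion currently being processed (remaining - 1), and at least one good copy must exist. A toy model of the condition (userspace, illustrative only):

#include <stdio.h>

static int can_ack_master(int behind_remaining, int remaining, int uptodate)
{
	return uptodate && behind_remaining >= remaining - 1;
}

int main(void)
{
	/* 3 mirrors, one write-mostly: once both fast mirrors finish,
	 * remaining == 1 (the write-mostly write) and behind_remaining == 1 */
	printf("%d\n", can_ack_master(1, 1, 1));	/* 1: ACK the master bio */
	printf("%d\n", can_ack_master(1, 2, 1));	/* 0: a fast write pending */
	printf("%d\n", can_ack_master(1, 1, 0));	/* 0: no good copy yet */
	return 0;
}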
@@ -360,13 +399,14 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
{
const unsigned long this_sector = r1_bio->sector;
int new_disk = conf->last_used, disk = new_disk;
+ int wonly_disk = -1;
const int sectors = r1_bio->sectors;
sector_t new_distance, current_distance;
- mdk_rdev_t *new_rdev, *rdev;
+ mdk_rdev_t *rdev;
rcu_read_lock();
/*
- * Check if it if we can balance. We can balance on the whole
+ * Check if we can balance. We can balance on the whole
* device if no resync is going on, or below the resync window.
* We take the first readable disk when above the resync window.
*/
@@ -376,11 +416,16 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
/* Choose the first operational device, for consistency */
new_disk = 0;
- while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL ||
- !new_rdev->in_sync) {
- new_disk++;
- if (new_disk == conf->raid_disks) {
- new_disk = -1;
+ for (rdev = conf->mirrors[new_disk].rdev;
+ !rdev || !rdev->in_sync
+ || test_bit(WriteMostly, &rdev->flags);
+ rdev = conf->mirrors[++new_disk].rdev) {
+
+ if (rdev && rdev->in_sync)
+ wonly_disk = new_disk;
+
+ if (new_disk == conf->raid_disks - 1) {
+ new_disk = wonly_disk;
break;
}
}
@@ -389,16 +434,26 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
/* make sure the disk is operational */
- while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL ||
- !new_rdev->in_sync) {
+ for (rdev = conf->mirrors[new_disk].rdev;
+ !rdev || !rdev->in_sync ||
+ test_bit(WriteMostly, &rdev->flags);
+ rdev = conf->mirrors[new_disk].rdev) {
+
+ if (rdev && rdev->in_sync)
+ wonly_disk = new_disk;
+
if (new_disk <= 0)
new_disk = conf->raid_disks;
new_disk--;
if (new_disk == disk) {
- new_disk = -1;
- goto rb_out;
+ new_disk = wonly_disk;
+ break;
}
}
+
+ if (new_disk < 0)
+ goto rb_out;
+
disk = new_disk;
/* now disk == new_disk == starting point for search */
@@ -419,37 +474,41 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
disk = conf->raid_disks;
disk--;
- if ((rdev=conf->mirrors[disk].rdev) == NULL ||
- !rdev->in_sync)
+ rdev = conf->mirrors[disk].rdev;
+
+ if (!rdev ||
+ !rdev->in_sync ||
+ test_bit(WriteMostly, &rdev->flags))
continue;
if (!atomic_read(&rdev->nr_pending)) {
new_disk = disk;
- new_rdev = rdev;
break;
}
new_distance = abs(this_sector - conf->mirrors[disk].head_position);
if (new_distance < current_distance) {
current_distance = new_distance;
new_disk = disk;
- new_rdev = rdev;
}
} while (disk != conf->last_used);
-rb_out:
+ rb_out:
if (new_disk >= 0) {
- conf->next_seq_sect = this_sector + sectors;
- conf->last_used = new_disk;
- atomic_inc(&new_rdev->nr_pending);
- if (!new_rdev->in_sync) {
+ rdev = conf->mirrors[new_disk].rdev;
+ if (!rdev)
+ goto retry;
+ atomic_inc(&rdev->nr_pending);
+ if (!rdev->in_sync) {
/* cannot risk returning a device that failed
* before we inc'ed nr_pending
*/
- atomic_dec(&new_rdev->nr_pending);
+ atomic_dec(&rdev->nr_pending);
goto retry;
}
+ conf->next_seq_sect = this_sector + sectors;
+ conf->last_used = new_disk;
}
rcu_read_unlock();
@@ -542,6 +601,39 @@ static void device_barrier(conf_t *conf, sector_t sect)
spin_unlock_irq(&conf->resync_lock);
}
+/* duplicate the data pages for behind I/O */
+static struct page **alloc_behind_pages(struct bio *bio)
+{
+ int i;
+ struct bio_vec *bvec;
+ struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *),
+ GFP_NOIO);
+ if (unlikely(!pages))
+ goto do_sync_io;
+
+ memset(pages, 0, bio->bi_vcnt * sizeof(struct page *));
+
+ bio_for_each_segment(bvec, bio, i) {
+ pages[i] = alloc_page(GFP_NOIO);
+ if (unlikely(!pages[i]))
+ goto do_sync_io;
+ memcpy(kmap(pages[i]) + bvec->bv_offset,
+ kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
+ kunmap(pages[i]);
+ kunmap(bvec->bv_page);
+ }
+
+ return pages;
+
+do_sync_io:
+ if (pages)
+ for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
+ __free_page(pages[i]);
+ kfree(pages);
+ PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
+ return NULL;
+}
+
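alloc_behind_pages() snapshots the bio's data so the master request can be acknowledged while write-mostly devices are still written from the private copy; on any allocation failure it unwinds and falls back to synchronous I/O. A userspace sketch of that copy-or-fall-back pattern (names illustrative, not kernel API):

#include <stdlib.h>
#include <string.h>

struct seg { void *buf; size_t len; };

/* snapshot the caller's segments; NULL means fall back to synchronous I/O */
static struct seg *snapshot_segments(const struct seg *src, size_t n)
{
	struct seg *copy = calloc(n, sizeof(*copy));
	size_t i;

	if (!copy)
		return NULL;
	for (i = 0; i < n; i++) {
		copy[i].buf = malloc(src[i].len);
		if (!copy[i].buf) {
			while (i--)
				free(copy[i].buf);	/* unwind partial copy */
			free(copy);
			return NULL;
		}
		memcpy(copy[i].buf, src[i].buf, src[i].len);
		copy[i].len = src[i].len;
	}
	return copy;
}
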
static int make_request(request_queue_t *q, struct bio * bio)
{
mddev_t *mddev = q->queuedata;
@@ -554,7 +646,12 @@ static int make_request(request_queue_t *q, struct bio * bio)
struct bitmap *bitmap = mddev->bitmap;
unsigned long flags;
struct bio_list bl;
+ struct page **behind_pages = NULL;
+ if (unlikely(bio_barrier(bio))) {
+ bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
+ return 0;
+ }
/*
* Register the new request and wait if the reconstruction
@@ -589,8 +686,6 @@ static int make_request(request_queue_t *q, struct bio * bio)
r1_bio->mddev = mddev;
r1_bio->sector = bio->bi_sector;
- r1_bio->state = 0;
-
if (bio_data_dir(bio) == READ) {
/*
* read balancing logic:
@@ -651,13 +746,22 @@ static int make_request(request_queue_t *q, struct bio * bio)
}
rcu_read_unlock();
+ BUG_ON(targets == 0); /* we never fail the last device */
+
if (targets < conf->raid_disks) {
/* array is degraded, we will not clear the bitmap
* on I/O completion (see raid1_end_write_request) */
set_bit(R1BIO_Degraded, &r1_bio->state);
}
+ /* do behind I/O ? */
+ if (bitmap &&
+ atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind &&
+ (behind_pages = alloc_behind_pages(bio)) != NULL)
+ set_bit(R1BIO_BehindIO, &r1_bio->state);
+
atomic_set(&r1_bio->remaining, 0);
+ atomic_set(&r1_bio->behind_remaining, 0);
bio_list_init(&bl);
for (i = 0; i < disks; i++) {
@@ -674,12 +778,31 @@ static int make_request(request_queue_t *q, struct bio * bio)
mbio->bi_rw = WRITE;
mbio->bi_private = r1_bio;
+ if (behind_pages) {
+ struct bio_vec *bvec;
+ int j;
+
+ /* Yes, I really want the '__' version so that
+ * we clear any unused pointer in the io_vec, rather
+ * than leave them unchanged. This is important
+ * because when we come to free the pages, we won't
+ * know the original bi_idx, so we just free
+ * them all
+ */
+ __bio_for_each_segment(bvec, mbio, j, 0)
+ bvec->bv_page = behind_pages[j];
+ if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
+ atomic_inc(&r1_bio->behind_remaining);
+ }
+
atomic_inc(&r1_bio->remaining);
bio_list_add(&bl, mbio);
}
+ kfree(behind_pages); /* the behind pages are attached to the bios now */
- bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors);
+ bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
+ test_bit(R1BIO_BehindIO, &r1_bio->state));
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_merge(&conf->pending_bio_list, &bl);
bio_list_init(&bl);
@@ -1105,6 +1228,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
sector_t max_sector, nr_sectors;
int disk;
int i;
+ int wonly;
int write_targets = 0;
int sync_blocks;
int still_degraded = 0;
@@ -1160,14 +1284,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
*/
disk = conf->last_used;
/* make sure disk is operational */
-
+ wonly = disk;
while (conf->mirrors[disk].rdev == NULL ||
- !conf->mirrors[disk].rdev->in_sync) {
+ !conf->mirrors[disk].rdev->in_sync ||
+ test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags)
+ ) {
+ if (conf->mirrors[disk].rdev &&
+ conf->mirrors[disk].rdev->in_sync)
+ wonly = disk;
if (disk <= 0)
disk = conf->raid_disks;
disk--;
- if (disk == conf->last_used)
+ if (disk == conf->last_used) {
+ disk = wonly;
break;
+ }
}
conf->last_used = disk;
atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
@@ -1439,6 +1570,17 @@ out:
static int stop(mddev_t *mddev)
{
conf_t *conf = mddev_to_conf(mddev);
+ struct bitmap *bitmap = mddev->bitmap;
+ int behind_wait = 0;
+
+ /* wait for behind writes to complete */
+ while (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
+ behind_wait++;
+ printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(HZ); /* wait a second */
+ /* need to kick something here to make sure I/O goes? */
+ }
md_unregister_thread(mddev->thread);
mddev->thread = NULL;
@@ -1561,6 +1703,35 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
return 0;
}
+static void raid1_quiesce(mddev_t *mddev, int state)
+{
+ conf_t *conf = mddev_to_conf(mddev);
+
+ switch(state) {
+ case 1:
+ spin_lock_irq(&conf->resync_lock);
+ conf->barrier++;
+ wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
+ conf->resync_lock, raid1_unplug(mddev->queue));
+ spin_unlock_irq(&conf->resync_lock);
+ break;
+ case 0:
+ spin_lock_irq(&conf->resync_lock);
+ conf->barrier--;
+ spin_unlock_irq(&conf->resync_lock);
+ wake_up(&conf->wait_resume);
+ wake_up(&conf->wait_idle);
+ break;
+ }
+ if (mddev->thread) {
+ if (mddev->bitmap)
+ mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+ else
+ mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+ md_wakeup_thread(mddev->thread);
+ }
+}
+
static mdk_personality_t raid1_personality =
{
@@ -1577,6 +1748,7 @@ static mdk_personality_t raid1_personality =
.sync_request = sync_request,
.resize = raid1_resize,
.reshape = raid1_reshape,
+ .quiesce = raid1_quiesce,
};
static int __init raid_init(void)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 62ebb1bc72b..5bd1e9ec899 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -538,7 +538,8 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
}
- current_distance = abs(this_sector - conf->mirrors[disk].head_position);
+ current_distance = abs(r10_bio->devs[slot].addr -
+ conf->mirrors[disk].head_position);
/* Find the disk whose head is closest */
@@ -668,6 +669,11 @@ static int make_request(request_queue_t *q, struct bio * bio)
int i;
int chunk_sects = conf->chunk_mask + 1;
+ if (unlikely(bio_barrier(bio))) {
+ bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
+ return 0;
+ }
+
/* If this request crosses a chunk boundary, we need to
* split it. This will only happen for 1 PAGE (or less) requests.
*/
@@ -900,6 +906,27 @@ static void close_sync(conf_t *conf)
conf->r10buf_pool = NULL;
}
+/* check if there are enough drives for
+ * every block to appear on at least one drive
+ */
+static int enough(conf_t *conf)
+{
+ int first = 0;
+
+ do {
+ int n = conf->copies;
+ int cnt = 0;
+ while (n--) {
+ if (conf->mirrors[first].rdev)
+ cnt++;
+ first = (first+1) % conf->raid_disks;
+ }
+ if (cnt == 0)
+ return 0;
+ } while (first != 0);
+ return 1;
+}
+
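For the default 'near' layout with raid_disks a multiple of copies, the loop above inspects each aligned group of `copies` consecutive slots for at least one live device; an empty group means some block has lost every copy. A userspace sketch with a worked 4-disk, 2-copy example:

#include <stdio.h>

static int enough(const int *present, int raid_disks, int copies)
{
	int first = 0;

	do {
		int n = copies, cnt = 0;
		while (n--) {
			if (present[first])
				cnt++;
			first = (first + 1) % raid_disks;
		}
		if (cnt == 0)
			return 0;	/* a whole copy-set is missing */
	} while (first != 0);
	return 1;
}

int main(void)
{
	int ok[4]  = { 1, 0, 0, 1 };	/* each pair keeps one live disk */
	int bad[4] = { 0, 0, 1, 1 };	/* pair {0,1} lost both copies */

	printf("%d %d\n", enough(ok, 4, 2), enough(bad, 4, 2));	/* 1 0 */
	return 0;
}
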
static int raid10_spare_active(mddev_t *mddev)
{
int i;
@@ -938,6 +965,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
* very different from resync
*/
return 0;
+ if (!enough(conf))
+ return 0;
for (mirror=0; mirror < mddev->raid_disks; mirror++)
if ( !(p=conf->mirrors+mirror)->rdev) {
@@ -1445,7 +1474,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
}
}
if (j == conf->copies) {
- BUG();
+ /* Cannot recover, so abort the recovery */
+ put_buf(r10_bio);
+ r10_bio = rb2;
+ if (!test_and_set_bit(MD_RECOVERY_ERR, &mddev->recovery))
+ printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n",
+ mdname(mddev));
+ break;
}
}
if (biolist == NULL) {
@@ -1678,9 +1713,10 @@ static int run(mddev_t *mddev)
init_waitqueue_head(&conf->wait_idle);
init_waitqueue_head(&conf->wait_resume);
- if (!conf->working_disks) {
- printk(KERN_ERR "raid10: no operational mirrors for %s\n",
- mdname(mddev));
+ /* need to check that every block has at least one working mirror */
+ if (!enough(conf)) {
+ printk(KERN_ERR "raid10: not enough operational mirrors for %s\n",
+ mdname(mddev));
goto out_free_conf;
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 43f231a467d..4683ca24c04 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -24,6 +24,8 @@
#include <linux/bitops.h>
#include <asm/atomic.h>
+#include <linux/raid/bitmap.h>
+
/*
* Stripe cache
*/
@@ -79,8 +81,13 @@ static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
if (test_bit(STRIPE_HANDLE, &sh->state)) {
if (test_bit(STRIPE_DELAYED, &sh->state))
list_add_tail(&sh->lru, &conf->delayed_list);
- else
+ else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+ conf->seq_write == sh->bm_seq)
+ list_add_tail(&sh->lru, &conf->bitmap_list);
+ else {
+ clear_bit(STRIPE_BIT_DELAY, &sh->state);
list_add_tail(&sh->lru, &conf->handle_list);
+ }
md_wakeup_thread(conf->mddev->thread);
} else {
if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
@@ -244,6 +251,9 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
spin_lock_irq(&conf->device_lock);
do {
+ wait_event_lock_irq(conf->wait_for_stripe,
+ conf->quiesce == 0,
+ conf->device_lock, /* nothing */);
sh = __find_stripe(conf, sector);
if (!sh) {
if (!conf->inactive_blocked)
@@ -803,6 +813,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
{
struct bio **bip;
raid5_conf_t *conf = sh->raid_conf;
+ int firstwrite=0;
PRINTK("adding bh b#%llu to stripe s#%llu\n",
(unsigned long long)bi->bi_sector,
@@ -811,9 +822,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
spin_lock(&sh->lock);
spin_lock_irq(&conf->device_lock);
- if (forwrite)
+ if (forwrite) {
bip = &sh->dev[dd_idx].towrite;
- else
+ if (*bip == NULL && sh->dev[dd_idx].written == NULL)
+ firstwrite = 1;
+ } else
bip = &sh->dev[dd_idx].toread;
while (*bip && (*bip)->bi_sector < bi->bi_sector) {
if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
@@ -836,6 +849,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
(unsigned long long)bi->bi_sector,
(unsigned long long)sh->sector, dd_idx);
+ if (conf->mddev->bitmap && firstwrite) {
+ sh->bm_seq = conf->seq_write;
+ bitmap_startwrite(conf->mddev->bitmap, sh->sector,
+ STRIPE_SECTORS, 0);
+ set_bit(STRIPE_BIT_DELAY, &sh->state);
+ }
+
if (forwrite) {
/* check if page is covered */
sector_t sector = sh->dev[dd_idx].sector;
@@ -958,12 +978,13 @@ static void handle_stripe(struct stripe_head *sh)
* need to be failed
*/
if (failed > 1 && to_read+to_write+written) {
- spin_lock_irq(&conf->device_lock);
for (i=disks; i--; ) {
+ int bitmap_end = 0;
+ spin_lock_irq(&conf->device_lock);
/* fail all writes first */
bi = sh->dev[i].towrite;
sh->dev[i].towrite = NULL;
- if (bi) to_write--;
+ if (bi) { to_write--; bitmap_end = 1; }
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
@@ -981,6 +1002,7 @@ static void handle_stripe(struct stripe_head *sh)
/* and fail all 'written' */
bi = sh->dev[i].written;
sh->dev[i].written = NULL;
+ if (bi) bitmap_end = 1;
while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1009,8 +1031,11 @@ static void handle_stripe(struct stripe_head *sh)
bi = nextbi;
}
}
+ spin_unlock_irq(&conf->device_lock);
+ if (bitmap_end)
+ bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+ STRIPE_SECTORS, 0, 0);
}
- spin_unlock_irq(&conf->device_lock);
}
if (failed > 1 && syncing) {
md_done_sync(conf->mddev, STRIPE_SECTORS,0);
@@ -1038,6 +1063,7 @@ static void handle_stripe(struct stripe_head *sh)
test_bit(R5_UPTODATE, &dev->flags) ) {
/* We can return any write requests */
struct bio *wbi, *wbi2;
+ int bitmap_end = 0;
PRINTK("Return write for disc %d\n", i);
spin_lock_irq(&conf->device_lock);
wbi = dev->written;
@@ -1051,7 +1077,13 @@ static void handle_stripe(struct stripe_head *sh)
}
wbi = wbi2;
}
+ if (dev->towrite == NULL)
+ bitmap_end = 1;
spin_unlock_irq(&conf->device_lock);
+ if (bitmap_end)
+ bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+ STRIPE_SECTORS,
+ !test_bit(STRIPE_DEGRADED, &sh->state), 0);
}
}
}
@@ -1175,7 +1207,8 @@ static void handle_stripe(struct stripe_head *sh)
}
}
/* now if nothing is locked, and if we have enough data, we can start a write request */
- if (locked == 0 && (rcw == 0 ||rmw == 0)) {
+ if (locked == 0 && (rcw == 0 ||rmw == 0) &&
+ !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
PRINTK("Computing parity...\n");
compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
/* now every locked buffer is ready to be written */
@@ -1231,6 +1264,7 @@ static void handle_stripe(struct stripe_head *sh)
dev = &sh->dev[failed_num];
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantwrite, &dev->flags);
+ clear_bit(STRIPE_DEGRADED, &sh->state);
locked++;
set_bit(STRIPE_INSYNC, &sh->state);
set_bit(R5_Syncio, &dev->flags);
@@ -1298,6 +1332,8 @@ static void handle_stripe(struct stripe_head *sh)
bi->bi_next = NULL;
generic_make_request(bi);
} else {
+ if (rw == 1)
+ set_bit(STRIPE_DEGRADED, &sh->state);
PRINTK("skip op %ld on disc %d for sector %llu\n",
bi->bi_rw, i, (unsigned long long)sh->sector);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -1322,6 +1358,20 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf)
}
}
+static inline void activate_bit_delay(raid5_conf_t *conf)
+{
+ /* device_lock is held */
+ struct list_head head;
+ list_add(&head, &conf->bitmap_list);
+ list_del_init(&conf->bitmap_list);
+ while (!list_empty(&head)) {
+ struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
+ list_del_init(&sh->lru);
+ atomic_inc(&sh->count);
+ __release_stripe(conf, sh);
+ }
+}
+
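The bitmap_list machinery orders bitmap and data writes: add_stripe_bio() stamps each stripe with conf->seq_write, the unplug path bumps conf->seq_flush, and raid5d writes the bitmap out before publishing the new seq_write, at which point __release_stripe() lets the stamped stripes proceed. A toy model of that handshake (userspace, illustrative only):

#include <stdio.h>

struct conf { unsigned seq_flush, seq_write; };

/* a stamped stripe stays on bitmap_list while seq_write == bm_seq,
 * i.e. until the bitmap has been flushed since the stripe was stamped */
static int stripe_may_write(const struct conf *c, unsigned bm_seq)
{
	return c->seq_write != bm_seq;
}

int main(void)
{
	struct conf c = { 0, 0 };
	unsigned bm_seq = c.seq_write;		/* stamp at write submission */

	printf("%d\n", stripe_may_write(&c, bm_seq));	/* 0: bitmap not flushed */

	c.seq_flush++;			/* unplug: schedule a bitmap flush */
	/* raid5d: write the bitmap out, then publish the new sequence */
	c.seq_write = c.seq_flush;
	printf("%d\n", stripe_may_write(&c, bm_seq));	/* 1: data may go out */
	return 0;
}
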
static void unplug_slaves(mddev_t *mddev)
{
raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -1354,8 +1404,10 @@ static void raid5_unplug_device(request_queue_t *q)
spin_lock_irqsave(&conf->device_lock, flags);
- if (blk_remove_plug(q))
+ if (blk_remove_plug(q)) {
+ conf->seq_flush++;
raid5_activate_delayed(conf);
+ }
md_wakeup_thread(mddev->thread);
spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -1411,6 +1463,11 @@ static int make_request (request_queue_t *q, struct bio * bi)
sector_t logical_sector, last_sector;
struct stripe_head *sh;
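+ /* write barriers are not supported; fail such bios with -EOPNOTSUPP */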
+ if (unlikely(bio_barrier(bi))) {
+ bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
+ return 0;
+ }
+
md_write_start(mddev, bi);
if (bio_data_dir(bi)==WRITE) {
@@ -1488,10 +1545,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
sector_t first_sector;
int raid_disks = conf->raid_disks;
int data_disks = raid_disks-1;
+ sector_t max_sector = mddev->size << 1;
+ int sync_blocks;
- if (sector_nr >= mddev->size <<1) {
+ if (sector_nr >= max_sector) {
/* just being told to finish up .. nothing much to do */
unplug_slaves(mddev);
+
+ if (mddev->curr_resync < max_sector) /* aborted */
+ bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
+ &sync_blocks, 1);
+ else /* completed sync */
+ conf->fullsync = 0;
+ bitmap_close_sync(mddev->bitmap);
+
return 0;
}
/* if there is 1 or more failed drives and we are trying
@@ -1503,6 +1570,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
*skipped = 1;
return rv;
}
+ if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+ !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
+ /* we can skip this block, and probably more */
+ sync_blocks /= STRIPE_SECTORS;
+ *skipped = 1;
+ return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
+ }
x = sector_nr;
chunk_offset = sector_div(x, sectors_per_chunk);
@@ -1520,6 +1594,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(1);
}
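+ /* note in the bitmap that a sync of this region is now under way */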
+ bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0);
spin_lock(&sh->lock);
set_bit(STRIPE_SYNCING, &sh->state);
clear_bit(STRIPE_INSYNC, &sh->state);
@@ -1553,6 +1628,13 @@ static void raid5d (mddev_t *mddev)
while (1) {
struct list_head *first;
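+ /* an unplug raised seq_flush: write the bitmap out, then release
+ * the stripes that were waiting for exactly that write-out */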
+ if (conf->seq_flush - conf->seq_write > 0) {
+ int seq = conf->seq_flush;
+ bitmap_unplug(mddev->bitmap);
+ conf->seq_write = seq;
+ activate_bit_delay(conf);
+ }
+
if (list_empty(&conf->handle_list) &&
atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
!blk_queue_plugged(mddev->queue) &&
@@ -1586,7 +1668,7 @@ static void raid5d (mddev_t *mddev)
PRINTK("--- raid5d inactive\n");
}
-static int run (mddev_t *mddev)
+static int run(mddev_t *mddev)
{
raid5_conf_t *conf;
int raid_disk, memory;
@@ -1616,6 +1698,7 @@ static int run (mddev_t *mddev)
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
INIT_LIST_HEAD(&conf->delayed_list);
+ INIT_LIST_HEAD(&conf->bitmap_list);
INIT_LIST_HEAD(&conf->inactive_list);
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
@@ -1727,6 +1810,9 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
/* Ok, everything is just fine now */
+ if (mddev->bitmap)
+ mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+
mddev->queue->unplug_fn = raid5_unplug_device;
mddev->queue->issue_flush_fn = raid5_issue_flush;
@@ -1907,6 +1993,8 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
rdev->in_sync = 0;
rdev->raid_disk = disk;
found = 1;
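+ /* device is not re-entering its previous slot, so bitmap-based
+ * partial recovery is unsafe; force a full resync */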
+ if (rdev->saved_raid_disk != disk)
+ conf->fullsync = 1;
p->rdev = rdev;
break;
}
@@ -1936,6 +2024,35 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
return 0;
}
+static void raid5_quiesce(mddev_t *mddev, int state)
+{
+ raid5_conf_t *conf = mddev_to_conf(mddev);
+
+ switch(state) {
+ case 1: /* stop all writes */
+ spin_lock_irq(&conf->device_lock);
+ conf->quiesce = 1;
+ wait_event_lock_irq(conf->wait_for_stripe,
+ atomic_read(&conf->active_stripes) == 0,
+ conf->device_lock, /* nothing */);
+ spin_unlock_irq(&conf->device_lock);
+ break;
+
+ case 0: /* re-enable writes */
+ spin_lock_irq(&conf->device_lock);
+ conf->quiesce = 0;
+ wake_up(&conf->wait_for_stripe);
+ spin_unlock_irq(&conf->device_lock);
+ break;
+ }
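+ /* re-derive the thread wakeup interval from the current bitmap state */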
+ if (mddev->thread) {
+ if (mddev->bitmap)
+ mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+ else
+ mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+ md_wakeup_thread(mddev->thread);
+ }
+}
static mdk_personality_t raid5_personality=
{
.name = "raid5",
@@ -1950,6 +2067,7 @@ static mdk_personality_t raid5_personality=
.spare_active = raid5_spare_active,
.sync_request = sync_request,
.resize = raid5_resize,
+ .quiesce = raid5_quiesce,
};
static int __init raid5_init (void)
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 495dee1d1e8..267eb1430c8 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -29,6 +29,8 @@
#include <asm/atomic.h>
#include "raid6.h"
+#include <linux/raid/bitmap.h>
+
/*
* Stripe cache
*/
@@ -98,8 +100,13 @@ static inline void __release_stripe(raid6_conf_t *conf, struct stripe_head *sh)
if (test_bit(STRIPE_HANDLE, &sh->state)) {
if (test_bit(STRIPE_DELAYED, &sh->state))
list_add_tail(&sh->lru, &conf->delayed_list);
- else
+ else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+ conf->seq_write == sh->bm_seq)
+ list_add_tail(&sh->lru, &conf->bitmap_list);
+ else {
+ clear_bit(STRIPE_BIT_DELAY, &sh->state);
list_add_tail(&sh->lru, &conf->handle_list);
+ }
md_wakeup_thread(conf->mddev->thread);
} else {
if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
@@ -262,6 +269,9 @@ static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector
spin_lock_irq(&conf->device_lock);
do {
+ wait_event_lock_irq(conf->wait_for_stripe,
+ conf->quiesce == 0,
+ conf->device_lock, /* nothing */);
sh = __find_stripe(conf, sector);
if (!sh) {
if (!conf->inactive_blocked)
@@ -906,6 +916,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
{
struct bio **bip;
raid6_conf_t *conf = sh->raid_conf;
+ int firstwrite = 0;
PRINTK("adding bh b#%llu to stripe s#%llu\n",
(unsigned long long)bi->bi_sector,
@@ -914,9 +925,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
spin_lock(&sh->lock);
spin_lock_irq(&conf->device_lock);
- if (forwrite)
+ if (forwrite) {
bip = &sh->dev[dd_idx].towrite;
- else
+ if (*bip == NULL && sh->dev[dd_idx].written == NULL)
+ firstwrite = 1;
+ } else
bip = &sh->dev[dd_idx].toread;
while (*bip && (*bip)->bi_sector < bi->bi_sector) {
if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
@@ -939,6 +952,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
(unsigned long long)bi->bi_sector,
(unsigned long long)sh->sector, dd_idx);
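+ /* first write to this stripe: set its bitmap bit and delay the
+ * write until that bitmap update is safely on disk */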
+ if (conf->mddev->bitmap && firstwrite) {
+ sh->bm_seq = conf->seq_write;
+ bitmap_startwrite(conf->mddev->bitmap, sh->sector,
+ STRIPE_SECTORS, 0);
+ set_bit(STRIPE_BIT_DELAY, &sh->state);
+ }
+
if (forwrite) {
/* check if page is covered */
sector_t sector = sh->dev[dd_idx].sector;
@@ -1066,12 +1086,13 @@ static void handle_stripe(struct stripe_head *sh)
* need to be failed
*/
if (failed > 2 && to_read+to_write+written) {
- spin_lock_irq(&conf->device_lock);
for (i=disks; i--; ) {
+ int bitmap_end = 0;
+ spin_lock_irq(&conf->device_lock);
/* fail all writes first */
bi = sh->dev[i].towrite;
sh->dev[i].towrite = NULL;
- if (bi) to_write--;
+ if (bi) { to_write--; bitmap_end = 1; }
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
@@ -1089,6 +1110,7 @@ static void handle_stripe(struct stripe_head *sh)
/* and fail all 'written' */
bi = sh->dev[i].written;
sh->dev[i].written = NULL;
+ if (bi) bitmap_end = 1;
while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1117,8 +1139,11 @@ static void handle_stripe(struct stripe_head *sh)
bi = nextbi;
}
}
+ spin_unlock_irq(&conf->device_lock);
+ if (bitmap_end)
+ bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+ STRIPE_SECTORS, 0, 0);
}
- spin_unlock_irq(&conf->device_lock);
}
if (failed > 2 && syncing) {
md_done_sync(conf->mddev, STRIPE_SECTORS,0);
@@ -1155,6 +1180,7 @@ static void handle_stripe(struct stripe_head *sh)
if (!test_bit(R5_LOCKED, &dev->flags) &&
test_bit(R5_UPTODATE, &dev->flags) ) {
/* We can return any write requests */
+ int bitmap_end = 0;
struct bio *wbi, *wbi2;
PRINTK("Return write for stripe %llu disc %d\n",
(unsigned long long)sh->sector, i);
@@ -1170,7 +1196,13 @@ static void handle_stripe(struct stripe_head *sh)
}
wbi = wbi2;
}
+ if (dev->towrite == NULL)
+ bitmap_end = 1;
spin_unlock_irq(&conf->device_lock);
+ if (bitmap_end)
+ bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+ STRIPE_SECTORS,
+ !test_bit(STRIPE_DEGRADED, &sh->state), 0);
}
}
}
@@ -1285,7 +1317,8 @@ static void handle_stripe(struct stripe_head *sh)
}
}
/* now if nothing is locked, and if we have enough data, we can start a write request */
- if (locked == 0 && rcw == 0) {
+ if (locked == 0 && rcw == 0 &&
+ !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
if ( must_compute > 0 ) {
/* We have failed blocks and need to compute them */
switch ( failed ) {
@@ -1388,6 +1421,7 @@ static void handle_stripe(struct stripe_head *sh)
bdev = &sh->dev[failed_num[1]];
locked += !test_bit(R5_LOCKED, &bdev->flags);
set_bit(R5_LOCKED, &bdev->flags);
+ clear_bit(STRIPE_DEGRADED, &sh->state);
set_bit(R5_Wantwrite, &bdev->flags);
set_bit(STRIPE_INSYNC, &sh->state);
@@ -1457,6 +1491,8 @@ static void handle_stripe(struct stripe_head *sh)
bi->bi_next = NULL;
generic_make_request(bi);
} else {
+ if (rw == 1)
+ set_bit(STRIPE_DEGRADED, &sh->state);
PRINTK("skip op %ld on disc %d for sector %llu\n",
bi->bi_rw, i, (unsigned long long)sh->sector);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -1481,6 +1517,20 @@ static inline void raid6_activate_delayed(raid6_conf_t *conf)
}
}
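+/* as in raid5: release the stripes queued on bitmap_list now that the
+ * bitmap flush they were waiting for has reached disk */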
+static inline void activate_bit_delay(raid6_conf_t *conf)
+{
+ /* device_lock is held */
+ struct list_head head;
+ list_add(&head, &conf->bitmap_list);
+ list_del_init(&conf->bitmap_list);
+ while (!list_empty(&head)) {
+ struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
+ list_del_init(&sh->lru);
+ atomic_inc(&sh->count);
+ __release_stripe(conf, sh);
+ }
+}
+
static void unplug_slaves(mddev_t *mddev)
{
raid6_conf_t *conf = mddev_to_conf(mddev);
@@ -1513,8 +1563,10 @@ static void raid6_unplug_device(request_queue_t *q)
spin_lock_irqsave(&conf->device_lock, flags);
- if (blk_remove_plug(q))
+ if (blk_remove_plug(q)) {
+ conf->seq_flush++;
raid6_activate_delayed(conf);
+ }
md_wakeup_thread(mddev->thread);
spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -1570,6 +1622,11 @@ static int make_request (request_queue_t *q, struct bio * bi)
sector_t logical_sector, last_sector;
struct stripe_head *sh;
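+ /* as in raid5, write barriers are unsupported and failed immediately */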
+ if (unlikely(bio_barrier(bi))) {
+ bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
+ return 0;
+ }
+
md_write_start(mddev, bi);
if (bio_data_dir(bi)==WRITE) {
@@ -1647,10 +1704,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
sector_t first_sector;
int raid_disks = conf->raid_disks;
int data_disks = raid_disks - 2;
+ sector_t max_sector = mddev->size << 1;
+ int sync_blocks;
- if (sector_nr >= mddev->size <<1) {
+ if (sector_nr >= max_sector) {
/* just being told to finish up .. nothing much to do */
unplug_slaves(mddev);
+
+ if (mddev->curr_resync < max_sector) /* aborted */
+ bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
+ &sync_blocks, 1);
+ else /* completed sync */
+ conf->fullsync = 0;
+ bitmap_close_sync(mddev->bitmap);
+
return 0;
}
/* if there are 2 or more failed drives and we are trying
@@ -1662,6 +1729,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
*skipped = 1;
return rv;
}
+ if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+ !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
+ /* we can skip this block, and probably more */
+ sync_blocks /= STRIPE_SECTORS;
+ *skipped = 1;
+ return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
+ }
x = sector_nr;
chunk_offset = sector_div(x, sectors_per_chunk);
@@ -1679,6 +1753,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(1);
}
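+ /* mark this region as being resynced in the bitmap */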
+ bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0);
spin_lock(&sh->lock);
set_bit(STRIPE_SYNCING, &sh->state);
clear_bit(STRIPE_INSYNC, &sh->state);
@@ -1712,6 +1787,13 @@ static void raid6d (mddev_t *mddev)
while (1) {
struct list_head *first;
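+ /* flush dirty bitmap pages to disk, then hand the bit-delayed
+ * stripes back to the handle list */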
+ if (conf->seq_flush - conf->seq_write > 0) {
+ int seq = conf->seq_flush;
+ bitmap_unplug(mddev->bitmap);
+ conf->seq_write = seq;
+ activate_bit_delay(conf);
+ }
+
if (list_empty(&conf->handle_list) &&
atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
!blk_queue_plugged(mddev->queue) &&
@@ -1745,7 +1827,7 @@ static void raid6d (mddev_t *mddev)
PRINTK("--- raid6d inactive\n");
}
-static int run (mddev_t *mddev)
+static int run(mddev_t *mddev)
{
raid6_conf_t *conf;
int raid_disk, memory;
@@ -1775,6 +1857,7 @@ static int run (mddev_t *mddev)
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
INIT_LIST_HEAD(&conf->delayed_list);
+ INIT_LIST_HEAD(&conf->bitmap_list);
INIT_LIST_HEAD(&conf->inactive_list);
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
@@ -1894,6 +1977,9 @@ static int run (mddev_t *mddev)
/* Ok, everything is just fine now */
mddev->array_size = mddev->size * (mddev->raid_disks - 2);
+ if (mddev->bitmap)
+ mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+
mddev->queue->unplug_fn = raid6_unplug_device;
mddev->queue->issue_flush_fn = raid6_issue_flush;
return 0;
@@ -2071,6 +2157,8 @@ static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
rdev->in_sync = 0;
rdev->raid_disk = disk;
found = 1;
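+ /* a device returning to a different slot invalidates any
+ * bitmap-based partial recovery; resync everything */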
+ if (rdev->saved_raid_disk != disk)
+ conf->fullsync = 1;
p->rdev = rdev;
break;
}
@@ -2100,6 +2188,35 @@ static int raid6_resize(mddev_t *mddev, sector_t sectors)
return 0;
}
+static void raid6_quiesce(mddev_t *mddev, int state)
+{
+ raid6_conf_t *conf = mddev_to_conf(mddev);
+
+ switch(state) {
+ case 1: /* stop all writes */
+ spin_lock_irq(&conf->device_lock);
+ conf->quiesce = 1;
+ wait_event_lock_irq(conf->wait_for_stripe,
+ atomic_read(&conf->active_stripes) == 0,
+ conf->device_lock, /* nothing */);
+ spin_unlock_irq(&conf->device_lock);
+ break;
+
+ case 0: /* re-enable writes */
+ spin_lock_irq(&conf->device_lock);
+ conf->quiesce = 0;
+ wake_up(&conf->wait_for_stripe);
+ spin_unlock_irq(&conf->device_lock);
+ break;
+ }
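+ /* refresh the thread timeout: periodic wakeups are only needed
+ * while a bitmap is present */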
+ if (mddev->thread) {
+ if (mddev->bitmap)
+ mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+ else
+ mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+ md_wakeup_thread(mddev->thread);
+ }
+}
static mdk_personality_t raid6_personality=
{
.name = "raid6",
@@ -2114,6 +2231,7 @@ static mdk_personality_t raid6_personality=
.spare_active = raid6_spare_active,
.sync_request = sync_request,
.resize = raid6_resize,
+ .quiesce = raid6_quiesce,
};
static int __init raid6_init (void)