Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig              |  21
-rw-r--r--  drivers/md/bitmap.c             |  17
-rw-r--r--  drivers/md/dm-crypt.c           | 506
-rw-r--r--  drivers/md/dm-emc.c             |   3
-rw-r--r--  drivers/md/dm-exception-store.c | 176
-rw-r--r--  drivers/md/dm-linear.c          |  19
-rw-r--r--  drivers/md/dm-mpath.c           |  83
-rw-r--r--  drivers/md/dm-raid1.c           |   4
-rw-r--r--  drivers/md/dm-snap.c            | 351
-rw-r--r--  drivers/md/dm-snap.h            |  17
-rw-r--r--  drivers/md/dm-table.c           | 109
-rw-r--r--  drivers/md/dm.c                 | 113
-rw-r--r--  drivers/md/dm.h                 |   7
-rw-r--r--  drivers/md/linear.c             |  15
-rw-r--r--  drivers/md/md.c                 | 275
-rw-r--r--  drivers/md/multipath.c          |  27
-rw-r--r--  drivers/md/raid0.c              |  17
-rw-r--r--  drivers/md/raid1.c              | 247
-rw-r--r--  drivers/md/raid10.c             | 261
-rw-r--r--  drivers/md/raid5.c              |  74
20 files changed, 1390 insertions(+), 952 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index bf869ed03ee..c92c1521546 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -2,6 +2,8 @@ # Block device driver configuration # +if BLOCK + menu "Multi-device support (RAID and LVM)" config MD @@ -136,16 +138,16 @@ config MD_RAID456 If unsure, say Y. config MD_RAID5_RESHAPE - bool "Support adding drives to a raid-5 array (experimental)" - depends on MD_RAID456 && EXPERIMENTAL + bool "Support adding drives to a raid-5 array" + depends on MD_RAID456 + default y ---help--- A RAID-5 set can be expanded by adding extra drives. This requires "restriping" the array which means (almost) every block must be written to a different place. This option allows such restriping to be done while the array - is online. However it is still EXPERIMENTAL code. It should - work, but please be sure that you have backups. + is online. You will need mdadm version 2.4.1 or later to use this feature safely. During the early stage of reshape there is @@ -162,6 +164,8 @@ config MD_RAID5_RESHAPE There should be enough spares already present to make the new array workable. + If unsure, say Y. + config MD_MULTIPATH tristate "Multipath I/O support" depends on BLK_DEV_MD @@ -199,6 +203,14 @@ config BLK_DEV_DM If unsure, say N. +config DM_DEBUG + boolean "Device mapper debugging support" + depends on BLK_DEV_DM && EXPERIMENTAL + ---help--- + Enable this for messages that may help debug device-mapper problems. + + If unsure, say N. + config DM_CRYPT tristate "Crypt target support" depends on BLK_DEV_DM && EXPERIMENTAL @@ -251,3 +263,4 @@ config DM_MULTIPATH_EMC endmenu +endif diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index ecc56765d94..8e67634e79a 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -613,6 +613,7 @@ static inline unsigned long file_page_offset(unsigned long chunk) static inline struct page *filemap_get_page(struct bitmap *bitmap, unsigned long chunk) { + if (file_page_index(chunk) >= bitmap->file_pages) return NULL; return bitmap->filemap[file_page_index(chunk) - file_page_index(0)]; } @@ -739,6 +740,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) } page = filemap_get_page(bitmap, chunk); + if (!page) return; bit = file_page_offset(chunk); /* set the bit */ @@ -1322,6 +1324,18 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n } +/* dirty the memory and file bits for bitmap chunks "s" to "e" */ +void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) +{ + unsigned long chunk; + + for (chunk = s; chunk <= e; chunk++) { + sector_t sec = chunk << CHUNK_BLOCK_SHIFT(bitmap); + bitmap_set_memory_bits(bitmap, sec, 1); + bitmap_file_set_bit(bitmap, sec); + } +} + /* * flush out any pending updates */ @@ -1430,8 +1444,7 @@ int bitmap_create(mddev_t *mddev) if (err) goto error; - bitmap->chunkshift = find_first_bit(&bitmap->chunksize, - sizeof(bitmap->chunksize)); + bitmap->chunkshift = ffz(~bitmap->chunksize); /* now that chunksize and chunkshift are set, we can use these macros */ chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) / diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index bdbd34993a8..655d816760e 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1,6 +1,7 @@ /* * Copyright (C) 2003 Christophe Saout <christophe@saout.de> * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> + * Copyright (C) 2006 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. 
*/ @@ -22,17 +23,19 @@ #include "dm.h" #define DM_MSG_PREFIX "crypt" +#define MESG_STR(x) x, sizeof(x) /* * per bio private data */ struct crypt_io { struct dm_target *target; - struct bio *bio; + struct bio *base_bio; struct bio *first_clone; struct work_struct work; atomic_t pending; int error; + int post_process; }; /* @@ -63,6 +66,7 @@ struct crypt_iv_operations { * Crypt: maps a linear range of a block device * and encrypts / decrypts at the same time. */ +enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; struct crypt_config { struct dm_dev *dev; sector_t start; @@ -73,6 +77,7 @@ struct crypt_config { */ mempool_t *io_pool; mempool_t *page_pool; + struct bio_set *bs; /* * crypto related data @@ -86,11 +91,12 @@ struct crypt_config { char cipher[CRYPTO_MAX_ALG_NAME]; char chainmode[CRYPTO_MAX_ALG_NAME]; struct crypto_blkcipher *tfm; + unsigned long flags; unsigned int key_size; u8 key[0]; }; -#define MIN_IOS 256 +#define MIN_IOS 16 #define MIN_POOL_PAGES 32 #define MIN_BIO_PAGES 8 @@ -306,6 +312,14 @@ static int crypt_convert(struct crypt_config *cc, return r; } + static void dm_crypt_bio_destructor(struct bio *bio) + { + struct crypt_io *io = bio->bi_private; + struct crypt_config *cc = io->target->private; + + bio_free(bio, cc->bs); + } + /* * Generate a new unfragmented bio with the given size * This should never violate the device limitations @@ -315,34 +329,33 @@ static struct bio * crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, struct bio *base_bio, unsigned int *bio_vec_idx) { - struct bio *bio; + struct bio *clone; unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM; unsigned int i; - /* - * Use __GFP_NOMEMALLOC to tell the VM to act less aggressively and - * to fail earlier. This is not necessary but increases throughput. - * FIXME: Is this really intelligent? 
- */ - if (base_bio) - bio = bio_clone(base_bio, GFP_NOIO|__GFP_NOMEMALLOC); - else - bio = bio_alloc(GFP_NOIO|__GFP_NOMEMALLOC, nr_iovecs); - if (!bio) + if (base_bio) { + clone = bio_alloc_bioset(GFP_NOIO, base_bio->bi_max_vecs, cc->bs); + __bio_clone(clone, base_bio); + } else + clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); + + if (!clone) return NULL; + clone->bi_destructor = dm_crypt_bio_destructor; + /* if the last bio was not complete, continue where that one ended */ - bio->bi_idx = *bio_vec_idx; - bio->bi_vcnt = *bio_vec_idx; - bio->bi_size = 0; - bio->bi_flags &= ~(1 << BIO_SEG_VALID); + clone->bi_idx = *bio_vec_idx; + clone->bi_vcnt = *bio_vec_idx; + clone->bi_size = 0; + clone->bi_flags &= ~(1 << BIO_SEG_VALID); - /* bio->bi_idx pages have already been allocated */ - size -= bio->bi_idx * PAGE_SIZE; + /* clone->bi_idx pages have already been allocated */ + size -= clone->bi_idx * PAGE_SIZE; - for(i = bio->bi_idx; i < nr_iovecs; i++) { - struct bio_vec *bv = bio_iovec_idx(bio, i); + for (i = clone->bi_idx; i < nr_iovecs; i++) { + struct bio_vec *bv = bio_iovec_idx(clone, i); bv->bv_page = mempool_alloc(cc->page_pool, gfp_mask); if (!bv->bv_page) @@ -353,7 +366,7 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, * return a partially allocated bio, the caller will then try * to allocate additional bios while submitting this partial bio */ - if ((i - bio->bi_idx) == (MIN_BIO_PAGES - 1)) + if ((i - clone->bi_idx) == (MIN_BIO_PAGES - 1)) gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT; bv->bv_offset = 0; @@ -362,13 +375,13 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, else bv->bv_len = size; - bio->bi_size += bv->bv_len; - bio->bi_vcnt++; + clone->bi_size += bv->bv_len; + clone->bi_vcnt++; size -= bv->bv_len; } - if (!bio->bi_size) { - bio_put(bio); + if (!clone->bi_size) { + bio_put(clone); return NULL; } @@ -376,13 +389,13 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, * Remember the last bio_vec allocated to be able * to correctly continue after the splitting. */ - *bio_vec_idx = bio->bi_vcnt; + *bio_vec_idx = clone->bi_vcnt; - return bio; + return clone; } static void crypt_free_buffer_pages(struct crypt_config *cc, - struct bio *bio, unsigned int bytes) + struct bio *clone, unsigned int bytes) { unsigned int i, start, end; struct bio_vec *bv; @@ -396,19 +409,19 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, * A fix to the bi_idx issue in the kernel is in the works, so * we will hopefully be able to revert to the cleaner solution soon. 
*/ - i = bio->bi_vcnt - 1; - bv = bio_iovec_idx(bio, i); - end = (i << PAGE_SHIFT) + (bv->bv_offset + bv->bv_len) - bio->bi_size; + i = clone->bi_vcnt - 1; + bv = bio_iovec_idx(clone, i); + end = (i << PAGE_SHIFT) + (bv->bv_offset + bv->bv_len) - clone->bi_size; start = end - bytes; start >>= PAGE_SHIFT; - if (!bio->bi_size) - end = bio->bi_vcnt; + if (!clone->bi_size) + end = clone->bi_vcnt; else end >>= PAGE_SHIFT; - for(i = start; i < end; i++) { - bv = bio_iovec_idx(bio, i); + for (i = start; i < end; i++) { + bv = bio_iovec_idx(clone, i); BUG_ON(!bv->bv_page); mempool_free(bv->bv_page, cc->page_pool); bv->bv_page = NULL; @@ -432,7 +445,7 @@ static void dec_pending(struct crypt_io *io, int error) if (io->first_clone) bio_put(io->first_clone); - bio_endio(io->bio, io->bio->bi_size, io->error); + bio_endio(io->base_bio, io->base_bio->bi_size, io->error); mempool_free(io, cc->io_pool); } @@ -441,29 +454,179 @@ static void dec_pending(struct crypt_io *io, int error) * kcryptd: * * Needed because it would be very unwise to do decryption in an - * interrupt context, so bios returning from read requests get - * queued here. + * interrupt context. */ static struct workqueue_struct *_kcryptd_workqueue; +static void kcryptd_do_work(void *data); -static void kcryptd_do_work(void *data) +static void kcryptd_queue_io(struct crypt_io *io) { - struct crypt_io *io = (struct crypt_io *) data; - struct crypt_config *cc = (struct crypt_config *) io->target->private; + INIT_WORK(&io->work, kcryptd_do_work, io); + queue_work(_kcryptd_workqueue, &io->work); +} + +static int crypt_endio(struct bio *clone, unsigned int done, int error) +{ + struct crypt_io *io = clone->bi_private; + struct crypt_config *cc = io->target->private; + unsigned read_io = bio_data_dir(clone) == READ; + + /* + * free the processed pages, even if + * it's only a partially completed write + */ + if (!read_io) + crypt_free_buffer_pages(cc, clone, done); + + /* keep going - not finished yet */ + if (unlikely(clone->bi_size)) + return 1; + + if (!read_io) + goto out; + + if (unlikely(!bio_flagged(clone, BIO_UPTODATE))) { + error = -EIO; + goto out; + } + + bio_put(clone); + io->post_process = 1; + kcryptd_queue_io(io); + return 0; + +out: + bio_put(clone); + dec_pending(io, error); + return error; +} + +static void clone_init(struct crypt_io *io, struct bio *clone) +{ + struct crypt_config *cc = io->target->private; + + clone->bi_private = io; + clone->bi_end_io = crypt_endio; + clone->bi_bdev = cc->dev->bdev; + clone->bi_rw = io->base_bio->bi_rw; +} + +static void process_read(struct crypt_io *io) +{ + struct crypt_config *cc = io->target->private; + struct bio *base_bio = io->base_bio; + struct bio *clone; + sector_t sector = base_bio->bi_sector - io->target->begin; + + atomic_inc(&io->pending); + + /* + * The block layer might modify the bvec array, so always + * copy the required bvecs because we need the original + * one in order to decrypt the whole bio data *afterwards*. 
+ */ + clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); + if (unlikely(!clone)) { + dec_pending(io, -ENOMEM); + return; + } + + clone_init(io, clone); + clone->bi_destructor = dm_crypt_bio_destructor; + clone->bi_idx = 0; + clone->bi_vcnt = bio_segments(base_bio); + clone->bi_size = base_bio->bi_size; + clone->bi_sector = cc->start + sector; + memcpy(clone->bi_io_vec, bio_iovec(base_bio), + sizeof(struct bio_vec) * clone->bi_vcnt); + + generic_make_request(clone); +} + +static void process_write(struct crypt_io *io) +{ + struct crypt_config *cc = io->target->private; + struct bio *base_bio = io->base_bio; + struct bio *clone; struct convert_context ctx; - int r; + unsigned remaining = base_bio->bi_size; + sector_t sector = base_bio->bi_sector - io->target->begin; + unsigned bvec_idx = 0; + + atomic_inc(&io->pending); + + crypt_convert_init(cc, &ctx, NULL, base_bio, sector, 1); + + /* + * The allocated buffers can be smaller than the whole bio, + * so repeat the whole process until all the data can be handled. + */ + while (remaining) { + clone = crypt_alloc_buffer(cc, base_bio->bi_size, + io->first_clone, &bvec_idx); + if (unlikely(!clone)) { + dec_pending(io, -ENOMEM); + return; + } + + ctx.bio_out = clone; + + if (unlikely(crypt_convert(cc, &ctx) < 0)) { + crypt_free_buffer_pages(cc, clone, clone->bi_size); + bio_put(clone); + dec_pending(io, -EIO); + return; + } + + clone_init(io, clone); + clone->bi_sector = cc->start + sector; + + if (!io->first_clone) { + /* + * hold a reference to the first clone, because it + * holds the bio_vec array and that can't be freed + * before all other clones are released + */ + bio_get(clone); + io->first_clone = clone; + } + + remaining -= clone->bi_size; + sector += bio_sectors(clone); + + /* prevent bio_put of first_clone */ + if (remaining) + atomic_inc(&io->pending); - crypt_convert_init(cc, &ctx, io->bio, io->bio, - io->bio->bi_sector - io->target->begin, 0); - r = crypt_convert(cc, &ctx); + generic_make_request(clone); - dec_pending(io, r); + /* out of memory -> run queues */ + if (remaining) + blk_congestion_wait(bio_data_dir(clone), HZ/100); + } } -static void kcryptd_queue_io(struct crypt_io *io) +static void process_read_endio(struct crypt_io *io) { - INIT_WORK(&io->work, kcryptd_do_work, io); - queue_work(_kcryptd_workqueue, &io->work); + struct crypt_config *cc = io->target->private; + struct convert_context ctx; + + crypt_convert_init(cc, &ctx, io->base_bio, io->base_bio, + io->base_bio->bi_sector - io->target->begin, 0); + + dec_pending(io, crypt_convert(cc, &ctx)); +} + +static void kcryptd_do_work(void *data) +{ + struct crypt_io *io = data; + + if (io->post_process) + process_read_endio(io); + else if (bio_data_dir(io->base_bio) == READ) + process_read(io); + else + process_write(io); } /* @@ -477,7 +640,7 @@ static int crypt_decode_key(u8 *key, char *hex, unsigned int size) buffer[2] = '\0'; - for(i = 0; i < size; i++) { + for (i = 0; i < size; i++) { buffer[0] = *hex++; buffer[1] = *hex++; @@ -500,13 +663,38 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size) { unsigned int i; - for(i = 0; i < size; i++) { + for (i = 0; i < size; i++) { sprintf(hex, "%02x", *key); hex += 2; key++; } } +static int crypt_set_key(struct crypt_config *cc, char *key) +{ + unsigned key_size = strlen(key) >> 1; + + if (cc->key_size && cc->key_size != key_size) + return -EINVAL; + + cc->key_size = key_size; /* initial settings */ + + if ((!key_size && strcmp(key, "-")) || + (key_size && crypt_decode_key(cc->key, key, 
key_size) < 0)) + return -EINVAL; + + set_bit(DM_CRYPT_KEY_VALID, &cc->flags); + + return 0; +} + +static int crypt_wipe_key(struct crypt_config *cc) +{ + clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); + memset(&cc->key, 0, cc->key_size * sizeof(u8)); + return 0; +} + /* * Construct an encryption mapping: * <cipher> <key> <iv_offset> <dev_path> <start> @@ -539,16 +727,14 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) key_size = strlen(argv[1]) >> 1; - cc = kmalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); + cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); if (cc == NULL) { ti->error = "Cannot allocate transparent encryption context"; return -ENOMEM; } - cc->key_size = key_size; - if ((!key_size && strcmp(argv[1], "-") != 0) || - (key_size && crypt_decode_key(cc->key, argv[1], key_size) < 0)) { + if (crypt_set_key(cc, argv[1])) { ti->error = "Error decoding key"; goto bad1; } @@ -626,6 +812,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad4; } + cc->bs = bioset_create(MIN_IOS, MIN_IOS, 4); + if (!cc->bs) { + ti->error = "Cannot allocate crypt bioset"; + goto bad_bs; + } + if (crypto_blkcipher_setkey(tfm, cc->key, key_size) < 0) { ti->error = "Error setting key"; goto bad5; @@ -665,6 +857,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) return 0; bad5: + bioset_free(cc->bs); +bad_bs: mempool_destroy(cc->page_pool); bad4: mempool_destroy(cc->io_pool); @@ -684,6 +878,7 @@ static void crypt_dtr(struct dm_target *ti) { struct crypt_config *cc = (struct crypt_config *) ti->private; + bioset_free(cc->bs); mempool_destroy(cc->page_pool); mempool_destroy(cc->io_pool); @@ -698,147 +893,21 @@ static void crypt_dtr(struct dm_target *ti) kfree(cc); } -static int crypt_endio(struct bio *bio, unsigned int done, int error) -{ - struct crypt_io *io = (struct crypt_io *) bio->bi_private; - struct crypt_config *cc = (struct crypt_config *) io->target->private; - - if (bio_data_dir(bio) == WRITE) { - /* - * free the processed pages, even if - * it's only a partially completed write - */ - crypt_free_buffer_pages(cc, bio, done); - } - - if (bio->bi_size) - return 1; - - bio_put(bio); - - /* - * successful reads are decrypted by the worker thread - */ - if ((bio_data_dir(bio) == READ) - && bio_flagged(bio, BIO_UPTODATE)) { - kcryptd_queue_io(io); - return 0; - } - - dec_pending(io, error); - return error; -} - -static inline struct bio * -crypt_clone(struct crypt_config *cc, struct crypt_io *io, struct bio *bio, - sector_t sector, unsigned int *bvec_idx, - struct convert_context *ctx) -{ - struct bio *clone; - - if (bio_data_dir(bio) == WRITE) { - clone = crypt_alloc_buffer(cc, bio->bi_size, - io->first_clone, bvec_idx); - if (clone) { - ctx->bio_out = clone; - if (crypt_convert(cc, ctx) < 0) { - crypt_free_buffer_pages(cc, clone, - clone->bi_size); - bio_put(clone); - return NULL; - } - } - } else { - /* - * The block layer might modify the bvec array, so always - * copy the required bvecs because we need the original - * one in order to decrypt the whole bio data *afterwards*. 
- */ - clone = bio_alloc(GFP_NOIO, bio_segments(bio)); - if (clone) { - clone->bi_idx = 0; - clone->bi_vcnt = bio_segments(bio); - clone->bi_size = bio->bi_size; - memcpy(clone->bi_io_vec, bio_iovec(bio), - sizeof(struct bio_vec) * clone->bi_vcnt); - } - } - - if (!clone) - return NULL; - - clone->bi_private = io; - clone->bi_end_io = crypt_endio; - clone->bi_bdev = cc->dev->bdev; - clone->bi_sector = cc->start + sector; - clone->bi_rw = bio->bi_rw; - - return clone; -} - static int crypt_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) { - struct crypt_config *cc = (struct crypt_config *) ti->private; - struct crypt_io *io = mempool_alloc(cc->io_pool, GFP_NOIO); - struct convert_context ctx; - struct bio *clone; - unsigned int remaining = bio->bi_size; - sector_t sector = bio->bi_sector - ti->begin; - unsigned int bvec_idx = 0; + struct crypt_config *cc = ti->private; + struct crypt_io *io; + io = mempool_alloc(cc->io_pool, GFP_NOIO); io->target = ti; - io->bio = bio; + io->base_bio = bio; io->first_clone = NULL; - io->error = 0; - atomic_set(&io->pending, 1); /* hold a reference */ - - if (bio_data_dir(bio) == WRITE) - crypt_convert_init(cc, &ctx, NULL, bio, sector, 1); - - /* - * The allocated buffers can be smaller than the whole bio, - * so repeat the whole process until all the data can be handled. - */ - while (remaining) { - clone = crypt_clone(cc, io, bio, sector, &bvec_idx, &ctx); - if (!clone) - goto cleanup; - - if (!io->first_clone) { - /* - * hold a reference to the first clone, because it - * holds the bio_vec array and that can't be freed - * before all other clones are released - */ - bio_get(clone); - io->first_clone = clone; - } - atomic_inc(&io->pending); + io->error = io->post_process = 0; + atomic_set(&io->pending, 0); + kcryptd_queue_io(io); - remaining -= clone->bi_size; - sector += bio_sectors(clone); - - generic_make_request(clone); - - /* out of memory -> run queues */ - if (remaining) - blk_congestion_wait(bio_data_dir(clone), HZ/100); - } - - /* drop reference, clones could have returned before we reach this */ - dec_pending(io, 0); return 0; - -cleanup: - if (io->first_clone) { - dec_pending(io, -ENOMEM); - return 0; - } - - /* if no bio has been dispatched yet, we can directly return the error */ - mempool_free(io, cc->io_pool); - return -ENOMEM; } static int crypt_status(struct dm_target *ti, status_type_t type, @@ -883,14 +952,71 @@ static int crypt_status(struct dm_target *ti, status_type_t type, return 0; } +static void crypt_postsuspend(struct dm_target *ti) +{ + struct crypt_config *cc = ti->private; + + set_bit(DM_CRYPT_SUSPENDED, &cc->flags); +} + +static int crypt_preresume(struct dm_target *ti) +{ + struct crypt_config *cc = ti->private; + + if (!test_bit(DM_CRYPT_KEY_VALID, &cc->flags)) { + DMERR("aborting resume - crypt key is not set."); + return -EAGAIN; + } + + return 0; +} + +static void crypt_resume(struct dm_target *ti) +{ + struct crypt_config *cc = ti->private; + + clear_bit(DM_CRYPT_SUSPENDED, &cc->flags); +} + +/* Message interface + * key set <key> + * key wipe + */ +static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) +{ + struct crypt_config *cc = ti->private; + + if (argc < 2) + goto error; + + if (!strnicmp(argv[0], MESG_STR("key"))) { + if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) { + DMWARN("not suspended during key manipulation."); + return -EINVAL; + } + if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) + return crypt_set_key(cc, argv[2]); + if (argc == 2 && !strnicmp(argv[1], 
MESG_STR("wipe"))) + return crypt_wipe_key(cc); + } + +error: + DMWARN("unrecognised message received."); + return -EINVAL; +} + static struct target_type crypt_target = { .name = "crypt", - .version= {1, 1, 0}, + .version= {1, 3, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, .map = crypt_map, .status = crypt_status, + .postsuspend = crypt_postsuspend, + .preresume = crypt_preresume, + .resume = crypt_resume, + .message = crypt_message, }; static int __init dm_crypt_init(void) diff --git a/drivers/md/dm-emc.c b/drivers/md/dm-emc.c index 2a374ccb30d..2b2d45d7baa 100644 --- a/drivers/md/dm-emc.c +++ b/drivers/md/dm-emc.c @@ -126,7 +126,8 @@ static struct request *get_failover_req(struct emc_handler *h, memset(&rq->cmd, 0, BLK_MAX_CDB); rq->timeout = EMC_FAILOVER_TIMEOUT; - rq->flags |= (REQ_BLOCK_PC | REQ_FAILFAST | REQ_NOMERGE); + rq->cmd_type = REQ_TYPE_BLOCK_PC; + rq->cmd_flags |= REQ_FAILFAST | REQ_NOMERGE; return rq; } diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index d12379b5cdb..99cdffa7fbf 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #define DM_MSG_PREFIX "snapshots" +#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */ /*----------------------------------------------------------------- * Persistent snapshots, by persistent we mean that the snapshot @@ -150,6 +151,7 @@ static int alloc_area(struct pstore *ps) static void free_area(struct pstore *ps) { vfree(ps->area); + ps->area = NULL; } /* @@ -198,48 +200,79 @@ static int read_header(struct pstore *ps, int *new_snapshot) int r; struct disk_header *dh; chunk_t chunk_size; + int chunk_size_supplied = 1; - r = chunk_io(ps, 0, READ); + /* + * Use default chunk size (or hardsect_size, if larger) if none supplied + */ + if (!ps->snap->chunk_size) { + ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, + bdev_hardsect_size(ps->snap->cow->bdev) >> 9); + ps->snap->chunk_mask = ps->snap->chunk_size - 1; + ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1; + chunk_size_supplied = 0; + } + + r = dm_io_get(sectors_to_pages(ps->snap->chunk_size)); if (r) return r; + r = alloc_area(ps); + if (r) + goto bad1; + + r = chunk_io(ps, 0, READ); + if (r) + goto bad2; + dh = (struct disk_header *) ps->area; if (le32_to_cpu(dh->magic) == 0) { *new_snapshot = 1; + return 0; + } - } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) { - *new_snapshot = 0; - ps->valid = le32_to_cpu(dh->valid); - ps->version = le32_to_cpu(dh->version); - chunk_size = le32_to_cpu(dh->chunk_size); - if (ps->snap->chunk_size != chunk_size) { - DMWARN("chunk size %llu in device metadata overrides " - "table chunk size of %llu.", - (unsigned long long)chunk_size, - (unsigned long long)ps->snap->chunk_size); - - /* We had a bogus chunk_size. Fix stuff up. 
*/ - dm_io_put(sectors_to_pages(ps->snap->chunk_size)); - free_area(ps); - - ps->snap->chunk_size = chunk_size; - ps->snap->chunk_mask = chunk_size - 1; - ps->snap->chunk_shift = ffs(chunk_size) - 1; - - r = alloc_area(ps); - if (r) - return r; - - r = dm_io_get(sectors_to_pages(chunk_size)); - if (r) - return r; - } - } else { - DMWARN("Invalid/corrupt snapshot"); + if (le32_to_cpu(dh->magic) != SNAP_MAGIC) { + DMWARN("Invalid or corrupt snapshot"); r = -ENXIO; + goto bad2; } + *new_snapshot = 0; + ps->valid = le32_to_cpu(dh->valid); + ps->version = le32_to_cpu(dh->version); + chunk_size = le32_to_cpu(dh->chunk_size); + + if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size) + return 0; + + DMWARN("chunk size %llu in device metadata overrides " + "table chunk size of %llu.", + (unsigned long long)chunk_size, + (unsigned long long)ps->snap->chunk_size); + + /* We had a bogus chunk_size. Fix stuff up. */ + dm_io_put(sectors_to_pages(ps->snap->chunk_size)); + free_area(ps); + + ps->snap->chunk_size = chunk_size; + ps->snap->chunk_mask = chunk_size - 1; + ps->snap->chunk_shift = ffs(chunk_size) - 1; + + r = dm_io_get(sectors_to_pages(chunk_size)); + if (r) + return r; + + r = alloc_area(ps); + if (r) + goto bad1; + + return 0; + +bad2: + free_area(ps); +bad1: + dm_io_put(sectors_to_pages(ps->snap->chunk_size)); return r; } @@ -263,42 +296,29 @@ static int write_header(struct pstore *ps) */ static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) { - if (index >= ps->exceptions_per_area) - return NULL; + BUG_ON(index >= ps->exceptions_per_area); return ((struct disk_exception *) ps->area) + index; } -static int read_exception(struct pstore *ps, - uint32_t index, struct disk_exception *result) +static void read_exception(struct pstore *ps, + uint32_t index, struct disk_exception *result) { - struct disk_exception *e; - - e = get_exception(ps, index); - if (!e) - return -EINVAL; + struct disk_exception *e = get_exception(ps, index); /* copy it */ result->old_chunk = le64_to_cpu(e->old_chunk); result->new_chunk = le64_to_cpu(e->new_chunk); - - return 0; } -static int write_exception(struct pstore *ps, - uint32_t index, struct disk_exception *de) +static void write_exception(struct pstore *ps, + uint32_t index, struct disk_exception *de) { - struct disk_exception *e; - - e = get_exception(ps, index); - if (!e) - return -EINVAL; + struct disk_exception *e = get_exception(ps, index); /* copy it */ e->old_chunk = cpu_to_le64(de->old_chunk); e->new_chunk = cpu_to_le64(de->new_chunk); - - return 0; } /* @@ -316,10 +336,7 @@ static int insert_exceptions(struct pstore *ps, int *full) *full = 1; for (i = 0; i < ps->exceptions_per_area; i++) { - r = read_exception(ps, i, &de); - - if (r) - return r; + read_exception(ps, i, &de); /* * If the new_chunk is pointing at the start of @@ -519,6 +536,16 @@ static void persistent_commit(struct exception_store *store, if (r) ps->valid = 0; + /* + * Have we completely filled the current area ? + */ + if (ps->current_committed == ps->exceptions_per_area) { + ps->current_committed = 0; + r = zero_area(ps, ps->current_area + 1); + if (r) + ps->valid = 0; + } + for (i = 0; i < ps->callback_count; i++) { cb = ps->callbacks + i; cb->callback(cb->context, r == 0 ? 1 : 0); @@ -526,16 +553,6 @@ static void persistent_commit(struct exception_store *store, ps->callback_count = 0; } - - /* - * Have we completely filled the current area ? 
- */ - if (ps->current_committed == ps->exceptions_per_area) { - ps->current_committed = 0; - r = zero_area(ps, ps->current_area + 1); - if (r) - ps->valid = 0; - } } static void persistent_drop(struct exception_store *store) @@ -547,32 +564,22 @@ static void persistent_drop(struct exception_store *store) DMWARN("write header failed"); } -int dm_create_persistent(struct exception_store *store, uint32_t chunk_size) +int dm_create_persistent(struct exception_store *store) { - int r; struct pstore *ps; - r = dm_io_get(sectors_to_pages(chunk_size)); - if (r) - return r; - /* allocate the pstore */ ps = kmalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) { - r = -ENOMEM; - goto bad; - } + if (!ps) + return -ENOMEM; ps->snap = store->snap; ps->valid = 1; ps->version = SNAPSHOT_DISK_VERSION; + ps->area = NULL; ps->next_free = 2; /* skipping the header and first area */ ps->current_committed = 0; - r = alloc_area(ps); - if (r) - goto bad; - ps->callback_count = 0; atomic_set(&ps->pending_count, 0); ps->callbacks = NULL; @@ -586,13 +593,6 @@ int dm_create_persistent(struct exception_store *store, uint32_t chunk_size) store->context = ps; return 0; - - bad: - dm_io_put(sectors_to_pages(chunk_size)); - if (ps && ps->area) - free_area(ps); - kfree(ps); - return r; } /*----------------------------------------------------------------- @@ -642,18 +642,16 @@ static void transient_fraction_full(struct exception_store *store, *denominator = get_dev_size(store->snap->cow->bdev); } -int dm_create_transient(struct exception_store *store, - struct dm_snapshot *s, int blocksize) +int dm_create_transient(struct exception_store *store) { struct transient_c *tc; - memset(store, 0, sizeof(*store)); store->destroy = transient_destroy; store->read_metadata = transient_read_metadata; store->prepare_exception = transient_prepare; store->commit_exception = transient_commit; + store->drop_snapshot = NULL; store->fraction_full = transient_fraction_full; - store->snap = s; tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); if (!tc) diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 47b3c62bbdb..00234909b3d 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -98,14 +98,31 @@ static int linear_status(struct dm_target *ti, status_type_t type, return 0; } +static int linear_ioctl(struct dm_target *ti, struct inode *inode, + struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct linear_c *lc = (struct linear_c *) ti->private; + struct block_device *bdev = lc->dev->bdev; + struct file fake_file = {}; + struct dentry fake_dentry = {}; + + fake_file.f_mode = lc->dev->mode; + fake_file.f_dentry = &fake_dentry; + fake_dentry.d_inode = bdev->bd_inode; + + return blkdev_driver_ioctl(bdev->bd_inode, &fake_file, bdev->bd_disk, cmd, arg); +} + static struct target_type linear_target = { .name = "linear", - .version= {1, 0, 1}, + .version= {1, 0, 2}, .module = THIS_MODULE, .ctr = linear_ctr, .dtr = linear_dtr, .map = linear_map, .status = linear_status, + .ioctl = linear_ioctl, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 93f701ea87b..d754e0bc6e9 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -114,12 +114,10 @@ static void trigger_event(void *data); static struct pgpath *alloc_pgpath(void) { - struct pgpath *pgpath = kmalloc(sizeof(*pgpath), GFP_KERNEL); + struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL); - if (pgpath) { - memset(pgpath, 0, sizeof(*pgpath)); + if (pgpath) pgpath->path.is_active = 1; - } return 
pgpath; } @@ -133,12 +131,10 @@ static struct priority_group *alloc_priority_group(void) { struct priority_group *pg; - pg = kmalloc(sizeof(*pg), GFP_KERNEL); - if (!pg) - return NULL; + pg = kzalloc(sizeof(*pg), GFP_KERNEL); - memset(pg, 0, sizeof(*pg)); - INIT_LIST_HEAD(&pg->pgpaths); + if (pg) + INIT_LIST_HEAD(&pg->pgpaths); return pg; } @@ -168,13 +164,12 @@ static void free_priority_group(struct priority_group *pg, kfree(pg); } -static struct multipath *alloc_multipath(void) +static struct multipath *alloc_multipath(struct dm_target *ti) { struct multipath *m; - m = kmalloc(sizeof(*m), GFP_KERNEL); + m = kzalloc(sizeof(*m), GFP_KERNEL); if (m) { - memset(m, 0, sizeof(*m)); INIT_LIST_HEAD(&m->priority_groups); spin_lock_init(&m->lock); m->queue_io = 1; @@ -185,6 +180,8 @@ static struct multipath *alloc_multipath(void) kfree(m); return NULL; } + m->ti = ti; + ti->private = m; } return m; @@ -557,8 +554,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, } static struct priority_group *parse_priority_group(struct arg_set *as, - struct multipath *m, - struct dm_target *ti) + struct multipath *m) { static struct param _params[] = { {1, 1024, "invalid number of paths"}, @@ -568,6 +564,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as, int r; unsigned i, nr_selector_args, nr_params; struct priority_group *pg; + struct dm_target *ti = m->ti; if (as->argc < 2) { as->argc = 0; @@ -624,12 +621,12 @@ static struct priority_group *parse_priority_group(struct arg_set *as, return NULL; } -static int parse_hw_handler(struct arg_set *as, struct multipath *m, - struct dm_target *ti) +static int parse_hw_handler(struct arg_set *as, struct multipath *m) { int r; struct hw_handler_type *hwht; unsigned hw_argc; + struct dm_target *ti = m->ti; static struct param _params[] = { {0, 1024, "invalid number of hardware handler args"}, @@ -661,11 +658,11 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m, return 0; } -static int parse_features(struct arg_set *as, struct multipath *m, - struct dm_target *ti) +static int parse_features(struct arg_set *as, struct multipath *m) { int r; unsigned argc; + struct dm_target *ti = m->ti; static struct param _params[] = { {0, 1, "invalid number of feature args"}, @@ -704,19 +701,17 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, as.argc = argc; as.argv = argv; - m = alloc_multipath(); + m = alloc_multipath(ti); if (!m) { ti->error = "can't allocate multipath"; return -EINVAL; } - m->ti = ti; - - r = parse_features(&as, m, ti); + r = parse_features(&as, m); if (r) goto bad; - r = parse_hw_handler(&as, m, ti); + r = parse_hw_handler(&as, m); if (r) goto bad; @@ -732,7 +727,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, while (as.argc) { struct priority_group *pg; - pg = parse_priority_group(&as, m, ti); + pg = parse_priority_group(&as, m); if (!pg) { r = -EINVAL; goto bad; @@ -752,8 +747,6 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, goto bad; } - ti->private = m; - return 0; bad: @@ -1266,12 +1259,47 @@ error: return -EINVAL; } +static int multipath_ioctl(struct dm_target *ti, struct inode *inode, + struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct multipath *m = (struct multipath *) ti->private; + struct block_device *bdev = NULL; + unsigned long flags; + struct file fake_file = {}; + struct dentry fake_dentry = {}; + int r = 0; + + fake_file.f_dentry = &fake_dentry; + + spin_lock_irqsave(&m->lock, flags); + + 
if (!m->current_pgpath) + __choose_pgpath(m); + + if (m->current_pgpath) { + bdev = m->current_pgpath->path.dev->bdev; + fake_dentry.d_inode = bdev->bd_inode; + fake_file.f_mode = m->current_pgpath->path.dev->mode; + } + + if (m->queue_io) + r = -EAGAIN; + else if (!bdev) + r = -EIO; + + spin_unlock_irqrestore(&m->lock, flags); + + return r ? : blkdev_driver_ioctl(bdev->bd_inode, &fake_file, + bdev->bd_disk, cmd, arg); +} + /*----------------------------------------------------------------- * Module setup *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 0, 4}, + .version = {1, 0, 5}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, @@ -1281,6 +1309,7 @@ static struct target_type multipath_target = { .resume = multipath_resume, .status = multipath_status, .message = multipath_message, + .ioctl = multipath_ioctl, }; static int __init dm_multipath_init(void) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index c54de989eb0..659224cb7c5 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1213,9 +1213,9 @@ static int mirror_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: - DMEMIT("%d ", ms->nr_mirrors); + DMEMIT("%d", ms->nr_mirrors); for (m = 0; m < ms->nr_mirrors; m++) - DMEMIT("%s %llu ", ms->mirror[m].dev->name, + DMEMIT(" %s %llu", ms->mirror[m].dev->name, (unsigned long long)ms->mirror[m].offset); } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 1d0fafda0f7..5281e009407 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -39,6 +39,9 @@ */ #define SNAPSHOT_PAGES 256 +struct workqueue_struct *ksnapd; +static void flush_queued_bios(void *data); + struct pending_exception { struct exception e; @@ -56,7 +59,7 @@ struct pending_exception { /* * The primary pending_exception is the one that holds - * the sibling_count and the list of origin_bios for a + * the ref_count and the list of origin_bios for a * group of pending_exceptions. It is always last to get freed. * These fields get set up when writing to the origin. */ @@ -69,7 +72,7 @@ struct pending_exception { * the sibling concerned and not pe->primary_pe->snap->lock unless * they are the same. */ - atomic_t sibling_count; + atomic_t ref_count; /* Pointer back to snapshot context */ struct dm_snapshot *snap; @@ -387,15 +390,46 @@ static inline ulong round_up(ulong n, ulong size) return (n + size) & ~size; } -static void read_snapshot_metadata(struct dm_snapshot *s) +static int set_chunk_size(struct dm_snapshot *s, const char *chunk_size_arg, + char **error) { - if (s->store.read_metadata(&s->store)) { - down_write(&s->lock); - s->valid = 0; - up_write(&s->lock); + unsigned long chunk_size; + char *value; + + chunk_size = simple_strtoul(chunk_size_arg, &value, 10); + if (*chunk_size_arg == '\0' || *value != '\0') { + *error = "Invalid chunk size"; + return -EINVAL; + } + + if (!chunk_size) { + s->chunk_size = s->chunk_mask = s->chunk_shift = 0; + return 0; + } + + /* + * Chunk size must be multiple of page size. Silently + * round up if it's not. 
+ */ + chunk_size = round_up(chunk_size, PAGE_SIZE >> 9); + + /* Check chunk_size is a power of 2 */ + if (chunk_size & (chunk_size - 1)) { + *error = "Chunk size is not a power of 2"; + return -EINVAL; + } - dm_table_event(s->table); + /* Validate the chunk size against the device block size */ + if (chunk_size % (bdev_hardsect_size(s->cow->bdev) >> 9)) { + *error = "Chunk size is not a multiple of device blocksize"; + return -EINVAL; } + + s->chunk_size = chunk_size; + s->chunk_mask = chunk_size - 1; + s->chunk_shift = ffs(chunk_size) - 1; + + return 0; } /* @@ -404,15 +438,12 @@ static void read_snapshot_metadata(struct dm_snapshot *s) static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct dm_snapshot *s; - unsigned long chunk_size; int r = -EINVAL; char persistent; char *origin_path; char *cow_path; - char *value; - int blocksize; - if (argc < 4) { + if (argc != 4) { ti->error = "requires exactly 4 arguments"; r = -EINVAL; goto bad1; @@ -428,13 +459,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad1; } - chunk_size = simple_strtoul(argv[3], &value, 10); - if (chunk_size == 0 || value == NULL) { - ti->error = "Invalid chunk size"; - r = -EINVAL; - goto bad1; - } - s = kmalloc(sizeof(*s), GFP_KERNEL); if (s == NULL) { ti->error = "Cannot allocate snapshot context private " @@ -457,36 +481,17 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad2; } - /* - * Chunk size must be multiple of page size. Silently - * round up if it's not. - */ - chunk_size = round_up(chunk_size, PAGE_SIZE >> 9); - - /* Validate the chunk size against the device block size */ - blocksize = s->cow->bdev->bd_disk->queue->hardsect_size; - if (chunk_size % (blocksize >> 9)) { - ti->error = "Chunk size is not a multiple of device blocksize"; - r = -EINVAL; - goto bad3; - } - - /* Check chunk_size is a power of 2 */ - if (chunk_size & (chunk_size - 1)) { - ti->error = "Chunk size is not a power of 2"; - r = -EINVAL; + r = set_chunk_size(s, argv[3], &ti->error); + if (r) goto bad3; - } - s->chunk_size = chunk_size; - s->chunk_mask = chunk_size - 1; s->type = persistent; - s->chunk_shift = ffs(chunk_size) - 1; s->valid = 1; s->active = 0; s->last_percent = 0; init_rwsem(&s->lock); + spin_lock_init(&s->pe_lock); s->table = ti->table; /* Allocate hash table for COW data */ @@ -496,16 +501,12 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad3; } - /* - * Check the persistent flag - done here because we need the iobuf - * to check the LV header - */ s->store.snap = s; if (persistent == 'P') - r = dm_create_persistent(&s->store, chunk_size); + r = dm_create_persistent(&s->store); else - r = dm_create_transient(&s->store, s, blocksize); + r = dm_create_transient(&s->store); if (r) { ti->error = "Couldn't create exception store"; @@ -520,7 +521,14 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) } /* Metadata must only be loaded into one table at once */ - read_snapshot_metadata(s); + r = s->store.read_metadata(&s->store); + if (r) { + ti->error = "Failed to read snapshot metadata"; + goto bad6; + } + + bio_list_init(&s->queued_bios); + INIT_WORK(&s->queued_bios_work, flush_queued_bios, s); /* Add snapshot to the list of snapshots for this origin */ /* Exceptions aren't triggered till snapshot_resume() is called */ @@ -560,6 +568,8 @@ static void snapshot_dtr(struct dm_target *ti) { struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + 
flush_workqueue(ksnapd); + /* Prevent further origin writes from using this snapshot. */ /* After this returns there can be no new kcopyd jobs. */ unregister_snapshot(s); @@ -593,6 +603,19 @@ static void flush_bios(struct bio *bio) } } +static void flush_queued_bios(void *data) +{ + struct dm_snapshot *s = (struct dm_snapshot *) data; + struct bio *queued_bios; + unsigned long flags; + + spin_lock_irqsave(&s->pe_lock, flags); + queued_bios = bio_list_get(&s->queued_bios); + spin_unlock_irqrestore(&s->pe_lock, flags); + + flush_bios(queued_bios); +} + /* * Error a list of buffers. */ @@ -608,28 +631,7 @@ static void error_bios(struct bio *bio) } } -static inline void error_snapshot_bios(struct pending_exception *pe) -{ - error_bios(bio_list_get(&pe->snapshot_bios)); -} - -static struct bio *__flush_bios(struct pending_exception *pe) -{ - /* - * If this pe is involved in a write to the origin and - * it is the last sibling to complete then release - * the bios for the original write to the origin. - */ - - if (pe->primary_pe && - atomic_dec_and_test(&pe->primary_pe->sibling_count)) - return bio_list_get(&pe->primary_pe->origin_bios); - - return NULL; -} - -static void __invalidate_snapshot(struct dm_snapshot *s, - struct pending_exception *pe, int err) +static void __invalidate_snapshot(struct dm_snapshot *s, int err) { if (!s->valid) return; @@ -639,9 +641,6 @@ static void __invalidate_snapshot(struct dm_snapshot *s, else if (err == -ENOMEM) DMERR("Invalidating snapshot: Unable to allocate exception."); - if (pe) - remove_exception(&pe->e); - if (s->store.drop_snapshot) s->store.drop_snapshot(&s->store); @@ -650,78 +649,95 @@ static void __invalidate_snapshot(struct dm_snapshot *s, dm_table_event(s->table); } +static void get_pending_exception(struct pending_exception *pe) +{ + atomic_inc(&pe->ref_count); +} + +static struct bio *put_pending_exception(struct pending_exception *pe) +{ + struct pending_exception *primary_pe; + struct bio *origin_bios = NULL; + + primary_pe = pe->primary_pe; + + /* + * If this pe is involved in a write to the origin and + * it is the last sibling to complete then release + * the bios for the original write to the origin. + */ + if (primary_pe && + atomic_dec_and_test(&primary_pe->ref_count)) + origin_bios = bio_list_get(&primary_pe->origin_bios); + + /* + * Free the pe if it's not linked to an origin write or if + * it's not itself a primary pe. + */ + if (!primary_pe || primary_pe != pe) + free_pending_exception(pe); + + /* + * Free the primary pe if nothing references it. 
+ */ + if (primary_pe && !atomic_read(&primary_pe->ref_count)) + free_pending_exception(primary_pe); + + return origin_bios; +} + static void pending_complete(struct pending_exception *pe, int success) { struct exception *e; - struct pending_exception *primary_pe; struct dm_snapshot *s = pe->snap; - struct bio *flush = NULL; + struct bio *origin_bios = NULL; + struct bio *snapshot_bios = NULL; + int error = 0; if (!success) { /* Read/write error - snapshot is unusable */ down_write(&s->lock); - __invalidate_snapshot(s, pe, -EIO); - flush = __flush_bios(pe); - up_write(&s->lock); - - error_snapshot_bios(pe); + __invalidate_snapshot(s, -EIO); + error = 1; goto out; } e = alloc_exception(); if (!e) { down_write(&s->lock); - __invalidate_snapshot(s, pe, -ENOMEM); - flush = __flush_bios(pe); - up_write(&s->lock); - - error_snapshot_bios(pe); + __invalidate_snapshot(s, -ENOMEM); + error = 1; goto out; } *e = pe->e; - /* - * Add a proper exception, and remove the - * in-flight exception from the list. - */ down_write(&s->lock); if (!s->valid) { - flush = __flush_bios(pe); - up_write(&s->lock); - free_exception(e); - - error_snapshot_bios(pe); + error = 1; goto out; } + /* + * Add a proper exception, and remove the + * in-flight exception from the list. + */ insert_exception(&s->complete, e); + + out: remove_exception(&pe->e); - flush = __flush_bios(pe); + snapshot_bios = bio_list_get(&pe->snapshot_bios); + origin_bios = put_pending_exception(pe); up_write(&s->lock); /* Submit any pending write bios */ - flush_bios(bio_list_get(&pe->snapshot_bios)); - - out: - primary_pe = pe->primary_pe; - - /* - * Free the pe if it's not linked to an origin write or if - * it's not itself a primary pe. - */ - if (!primary_pe || primary_pe != pe) - free_pending_exception(pe); - - /* - * Free the primary pe if nothing references it. - */ - if (primary_pe && !atomic_read(&primary_pe->sibling_count)) - free_pending_exception(primary_pe); + if (error) + error_bios(snapshot_bios); + else + flush_bios(snapshot_bios); - if (flush) - flush_bios(flush); + flush_bios(origin_bios); } static void commit_callback(void *context, int success) @@ -822,7 +838,7 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio) bio_list_init(&pe->origin_bios); bio_list_init(&pe->snapshot_bios); pe->primary_pe = NULL; - atomic_set(&pe->sibling_count, 1); + atomic_set(&pe->ref_count, 0); pe->snap = s; pe->started = 0; @@ -831,6 +847,7 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio) return NULL; } + get_pending_exception(pe); insert_exception(&s->pending, &pe->e); out: @@ -850,7 +867,6 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, { struct exception *e; struct dm_snapshot *s = (struct dm_snapshot *) ti->private; - int copy_needed = 0; int r = 1; chunk_t chunk; struct pending_exception *pe = NULL; @@ -865,32 +881,31 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, if (unlikely(bio_barrier(bio))) return -EOPNOTSUPP; + /* FIXME: should only take write lock if we need + * to copy an exception */ + down_write(&s->lock); + + if (!s->valid) { + r = -EIO; + goto out_unlock; + } + + /* If the block is already remapped - use that, else remap it */ + e = lookup_exception(&s->complete, chunk); + if (e) { + remap_exception(s, e, bio); + goto out_unlock; + } + /* * Write to snapshot - higher level takes care of RW/RO * flags so we should only get this if we are * writeable. 
*/ if (bio_rw(bio) == WRITE) { - - /* FIXME: should only take write lock if we need - * to copy an exception */ - down_write(&s->lock); - - if (!s->valid) { - r = -EIO; - goto out_unlock; - } - - /* If the block is already remapped - use that, else remap it */ - e = lookup_exception(&s->complete, chunk); - if (e) { - remap_exception(s, e, bio); - goto out_unlock; - } - pe = __find_pending_exception(s, bio); if (!pe) { - __invalidate_snapshot(s, pe, -ENOMEM); + __invalidate_snapshot(s, -ENOMEM); r = -EIO; goto out_unlock; } @@ -898,45 +913,27 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, remap_exception(s, &pe->e, bio); bio_list_add(&pe->snapshot_bios, bio); + r = 0; + if (!pe->started) { /* this is protected by snap->lock */ pe->started = 1; - copy_needed = 1; - } - - r = 0; - - out_unlock: - up_write(&s->lock); - - if (copy_needed) + up_write(&s->lock); start_copy(pe); - } else { + goto out; + } + } else /* * FIXME: this read path scares me because we * always use the origin when we have a pending * exception. However I can't think of a * situation where this is wrong - ejt. */ + bio->bi_bdev = s->origin->bdev; - /* Do reads */ - down_read(&s->lock); - - if (!s->valid) { - up_read(&s->lock); - return -EIO; - } - - /* See if it it has been remapped */ - e = lookup_exception(&s->complete, chunk); - if (e) - remap_exception(s, e, bio); - else - bio->bi_bdev = s->origin->bdev; - - up_read(&s->lock); - } - + out_unlock: + up_write(&s->lock); + out: return r; } @@ -1025,7 +1022,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) * is already remapped in this snapshot * and trigger an exception if not. * - * sibling_count is initialised to 1 so pending_complete() + * ref_count is initialised to 1 so pending_complete() * won't destroy the primary_pe while we're inside this loop. */ e = lookup_exception(&snap->complete, chunk); @@ -1034,7 +1031,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) pe = __find_pending_exception(snap, bio); if (!pe) { - __invalidate_snapshot(snap, pe, ENOMEM); + __invalidate_snapshot(snap, -ENOMEM); goto next_snapshot; } @@ -1056,8 +1053,8 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) } if (!pe->primary_pe) { - atomic_inc(&primary_pe->sibling_count); pe->primary_pe = primary_pe; + get_pending_exception(primary_pe); } if (!pe->started) { @@ -1070,20 +1067,20 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) } if (!primary_pe) - goto out; + return r; /* * If this is the first time we're processing this chunk and - * sibling_count is now 1 it means all the pending exceptions + * ref_count is now 1 it means all the pending exceptions * got completed while we were in the loop above, so it falls to * us here to remove the primary_pe and submit any origin_bios. */ - if (first && atomic_dec_and_test(&primary_pe->sibling_count)) { + if (first && atomic_dec_and_test(&primary_pe->ref_count)) { flush_bios(bio_list_get(&primary_pe->origin_bios)); free_pending_exception(primary_pe); /* If we got here, pe_queue is necessarily empty. 
*/ - goto out; + return r; } /* @@ -1092,7 +1089,6 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) list_for_each_entry_safe(pe, next_pe, &pe_queue, list) start_copy(pe); - out: return r; } @@ -1205,7 +1201,7 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result, static struct target_type origin_target = { .name = "snapshot-origin", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .module = THIS_MODULE, .ctr = origin_ctr, .dtr = origin_dtr, @@ -1216,7 +1212,7 @@ static struct target_type origin_target = { static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, @@ -1275,8 +1271,17 @@ static int __init dm_snapshot_init(void) goto bad5; } + ksnapd = create_singlethread_workqueue("ksnapd"); + if (!ksnapd) { + DMERR("Failed to create ksnapd workqueue."); + r = -ENOMEM; + goto bad6; + } + return 0; + bad6: + mempool_destroy(pending_pool); bad5: kmem_cache_destroy(pending_cache); bad4: @@ -1294,6 +1299,8 @@ static void __exit dm_snapshot_exit(void) { int r; + destroy_workqueue(ksnapd); + r = dm_unregister_target(&snapshot_target); if (r) DMERR("snapshot unregister failed %d", r); diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h index fdec1e2dc87..15fa2ae6cdc 100644 --- a/drivers/md/dm-snap.h +++ b/drivers/md/dm-snap.h @@ -10,7 +10,9 @@ #define DM_SNAPSHOT_H #include "dm.h" +#include "dm-bio-list.h" #include <linux/blkdev.h> +#include <linux/workqueue.h> struct exception_table { uint32_t hash_mask; @@ -112,10 +114,20 @@ struct dm_snapshot { struct exception_table pending; struct exception_table complete; + /* + * pe_lock protects all pending_exception operations and access + * as well as the snapshot_bios list. + */ + spinlock_t pe_lock; + /* The on disk metadata handler */ struct exception_store store; struct kcopyd_client *kcopyd_client; + + /* Queue of snapshot writes for ksnapd to flush */ + struct bio_list queued_bios; + struct work_struct queued_bios_work; }; /* @@ -128,10 +140,9 @@ int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new); * Constructor and destructor for the default persistent * store. */ -int dm_create_persistent(struct exception_store *store, uint32_t chunk_size); +int dm_create_persistent(struct exception_store *store); -int dm_create_transient(struct exception_store *store, - struct dm_snapshot *s, int blocksize); +int dm_create_transient(struct exception_store *store); /* * Return the number of sectors in the device. diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 75fe9493e6a..05befa91807 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -522,56 +522,61 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, return 0; } - -int dm_get_device(struct dm_target *ti, const char *path, sector_t start, - sector_t len, int mode, struct dm_dev **result) +void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) { - int r = __table_get_device(ti->table, ti, path, - start, len, mode, result); - if (!r) { - request_queue_t *q = bdev_get_queue((*result)->bdev); - struct io_restrictions *rs = &ti->limits; - - /* - * Combine the device limits low. - * - * FIXME: if we move an io_restriction struct - * into q this would just be a call to - * combine_restrictions_low() - */ + request_queue_t *q = bdev_get_queue(bdev); + struct io_restrictions *rs = &ti->limits; + + /* + * Combine the device limits low. 
+ * + * FIXME: if we move an io_restriction struct + * into q this would just be a call to + * combine_restrictions_low() + */ + rs->max_sectors = + min_not_zero(rs->max_sectors, q->max_sectors); + + /* FIXME: Device-Mapper on top of RAID-0 breaks because DM + * currently doesn't honor MD's merge_bvec_fn routine. + * In this case, we'll force DM to use PAGE_SIZE or + * smaller I/O, just to be safe. A better fix is in the + * works, but add this for the time being so it will at + * least operate correctly. + */ + if (q->merge_bvec_fn) rs->max_sectors = - min_not_zero(rs->max_sectors, q->max_sectors); + min_not_zero(rs->max_sectors, + (unsigned int) (PAGE_SIZE >> 9)); - /* FIXME: Device-Mapper on top of RAID-0 breaks because DM - * currently doesn't honor MD's merge_bvec_fn routine. - * In this case, we'll force DM to use PAGE_SIZE or - * smaller I/O, just to be safe. A better fix is in the - * works, but add this for the time being so it will at - * least operate correctly. - */ - if (q->merge_bvec_fn) - rs->max_sectors = - min_not_zero(rs->max_sectors, - (unsigned int) (PAGE_SIZE >> 9)); + rs->max_phys_segments = + min_not_zero(rs->max_phys_segments, + q->max_phys_segments); - rs->max_phys_segments = - min_not_zero(rs->max_phys_segments, - q->max_phys_segments); + rs->max_hw_segments = + min_not_zero(rs->max_hw_segments, q->max_hw_segments); - rs->max_hw_segments = - min_not_zero(rs->max_hw_segments, q->max_hw_segments); + rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size); - rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size); + rs->max_segment_size = + min_not_zero(rs->max_segment_size, q->max_segment_size); - rs->max_segment_size = - min_not_zero(rs->max_segment_size, q->max_segment_size); + rs->seg_boundary_mask = + min_not_zero(rs->seg_boundary_mask, + q->seg_boundary_mask); - rs->seg_boundary_mask = - min_not_zero(rs->seg_boundary_mask, - q->seg_boundary_mask); + rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); +} +EXPORT_SYMBOL_GPL(dm_set_device_limits); - rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); - } +int dm_get_device(struct dm_target *ti, const char *path, sector_t start, + sector_t len, int mode, struct dm_dev **result) +{ + int r = __table_get_device(ti->table, ti, path, + start, len, mode, result); + + if (!r) + dm_set_device_limits(ti, (*result)->bdev); return r; } @@ -939,9 +944,20 @@ void dm_table_postsuspend_targets(struct dm_table *t) return suspend_targets(t, 1); } -void dm_table_resume_targets(struct dm_table *t) +int dm_table_resume_targets(struct dm_table *t) { - int i; + int i, r = 0; + + for (i = 0; i < t->num_targets; i++) { + struct dm_target *ti = t->targets + i; + + if (!ti->type->preresume) + continue; + + r = ti->type->preresume(ti); + if (r) + return r; + } for (i = 0; i < t->num_targets; i++) { struct dm_target *ti = t->targets + i; @@ -949,6 +965,8 @@ void dm_table_resume_targets(struct dm_table *t) if (ti->type->resume) ti->type->resume(ti); } + + return 0; } int dm_table_any_congested(struct dm_table *t, int bdi_bits) @@ -983,6 +1001,11 @@ int dm_table_flush_all(struct dm_table *t) { struct list_head *d, *devices = dm_table_get_devices(t); int ret = 0; + unsigned i; + + for (i = 0; i < t->num_targets; i++) + if (t->targets[i].type->flush) + t->targets[i].type->flush(&t->targets[i]); for (d = devices->next; d != devices; d = d->next) { struct dm_dev *dd = list_entry(d, struct dm_dev, list); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c99bf9f0175..b5764a86c8b 100644 --- a/drivers/md/dm.c 
+++ b/drivers/md/dm.c @@ -20,6 +20,7 @@ #include <linux/idr.h> #include <linux/hdreg.h> #include <linux/blktrace_api.h> +#include <linux/smp_lock.h> #define DM_MSG_PREFIX "core" @@ -101,6 +102,8 @@ struct mapped_device { mempool_t *io_pool; mempool_t *tio_pool; + struct bio_set *bs; + /* * Event handling. */ @@ -121,16 +124,10 @@ struct mapped_device { static kmem_cache_t *_io_cache; static kmem_cache_t *_tio_cache; -static struct bio_set *dm_set; - static int __init local_init(void) { int r; - dm_set = bioset_create(16, 16, 4); - if (!dm_set) - return -ENOMEM; - /* allocate a slab for the dm_ios */ _io_cache = kmem_cache_create("dm_io", sizeof(struct dm_io), 0, 0, NULL, NULL); @@ -164,8 +161,6 @@ static void local_exit(void) kmem_cache_destroy(_tio_cache); kmem_cache_destroy(_io_cache); - bioset_free(dm_set); - if (unregister_blkdev(_major, _name) < 0) DMERR("unregister_blkdev failed"); @@ -288,6 +283,45 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) return dm_get_geometry(md, geo); } +static int dm_blk_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct mapped_device *md; + struct dm_table *map; + struct dm_target *tgt; + int r = -ENOTTY; + + /* We don't really need this lock, but we do need 'inode'. */ + unlock_kernel(); + + md = inode->i_bdev->bd_disk->private_data; + + map = dm_get_table(md); + + if (!map || !dm_table_get_size(map)) + goto out; + + /* We only support devices that have a single target */ + if (dm_table_get_num_targets(map) != 1) + goto out; + + tgt = dm_table_get_target(map, 0); + + if (dm_suspended(md)) { + r = -EAGAIN; + goto out; + } + + if (tgt->type->ioctl) + r = tgt->type->ioctl(tgt, inode, file, cmd, arg); + +out: + dm_table_put(map); + + lock_kernel(); + return r; +} + static inline struct dm_io *alloc_io(struct mapped_device *md) { return mempool_alloc(md->io_pool, GFP_NOIO); @@ -435,7 +469,7 @@ static int clone_endio(struct bio *bio, unsigned int done, int error) { int r = 0; struct target_io *tio = bio->bi_private; - struct dm_io *io = tio->io; + struct mapped_device *md = tio->io->md; dm_endio_fn endio = tio->ti->type->end_io; if (bio->bi_size) @@ -454,9 +488,15 @@ static int clone_endio(struct bio *bio, unsigned int done, int error) return 1; } - free_tio(io->md, tio); - dec_pending(io, error); + dec_pending(tio->io, error); + + /* + * Store md for cleanup instead of tio which is about to get freed. + */ + bio->bi_private = md->bs; + bio_put(bio); + free_tio(md, tio); return r; } @@ -485,6 +525,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, { int r; sector_t sector; + struct mapped_device *md; /* * Sanity checks. @@ -514,10 +555,14 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, else if (r < 0) { /* error the io and bail out */ - struct dm_io *io = tio->io; - free_tio(tio->io->md, tio); - dec_pending(io, r); + md = tio->io->md; + dec_pending(tio->io, r); + /* + * Store bio_set for cleanup. 
+ */ + clone->bi_private = md->bs; bio_put(clone); + free_tio(md, tio); } } @@ -533,7 +578,9 @@ struct clone_info { static void dm_bio_destructor(struct bio *bio) { - bio_free(bio, dm_set); + struct bio_set *bs = bio->bi_private; + + bio_free(bio, bs); } /* @@ -541,12 +588,12 @@ static void dm_bio_destructor(struct bio *bio) */ static struct bio *split_bvec(struct bio *bio, sector_t sector, unsigned short idx, unsigned int offset, - unsigned int len) + unsigned int len, struct bio_set *bs) { struct bio *clone; struct bio_vec *bv = bio->bi_io_vec + idx; - clone = bio_alloc_bioset(GFP_NOIO, 1, dm_set); + clone = bio_alloc_bioset(GFP_NOIO, 1, bs); clone->bi_destructor = dm_bio_destructor; *clone->bi_io_vec = *bv; @@ -566,11 +613,13 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector, */ static struct bio *clone_bio(struct bio *bio, sector_t sector, unsigned short idx, unsigned short bv_count, - unsigned int len) + unsigned int len, struct bio_set *bs) { struct bio *clone; - clone = bio_clone(bio, GFP_NOIO); + clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); + __bio_clone(clone, bio); + clone->bi_destructor = dm_bio_destructor; clone->bi_sector = sector; clone->bi_idx = idx; clone->bi_vcnt = idx + bv_count; @@ -601,7 +650,8 @@ static void __clone_and_map(struct clone_info *ci) * the remaining io with a single clone. */ clone = clone_bio(bio, ci->sector, ci->idx, - bio->bi_vcnt - ci->idx, ci->sector_count); + bio->bi_vcnt - ci->idx, ci->sector_count, + ci->md->bs); __map_bio(ti, clone, tio); ci->sector_count = 0; @@ -624,7 +674,8 @@ static void __clone_and_map(struct clone_info *ci) len += bv_len; } - clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len); + clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, + ci->md->bs); __map_bio(ti, clone, tio); ci->sector += len; @@ -653,7 +704,8 @@ static void __clone_and_map(struct clone_info *ci) len = min(remaining, max); clone = split_bvec(bio, ci->sector, ci->idx, - bv->bv_offset + offset, len); + bv->bv_offset + offset, len, + ci->md->bs); __map_bio(ti, clone, tio); @@ -903,7 +955,7 @@ static struct mapped_device *alloc_dev(int minor) md->queue = blk_alloc_queue(GFP_KERNEL); if (!md->queue) - goto bad1; + goto bad1_free_minor; md->queue->queuedata = md; md->queue->backing_dev_info.congested_fn = dm_any_congested; @@ -921,6 +973,10 @@ static struct mapped_device *alloc_dev(int minor) if (!md->tio_pool) goto bad3; + md->bs = bioset_create(16, 16, 4); + if (!md->bs) + goto bad_no_bioset; + md->disk = alloc_disk(1); if (!md->disk) goto bad4; @@ -948,11 +1004,14 @@ static struct mapped_device *alloc_dev(int minor) return md; bad4: + bioset_free(md->bs); + bad_no_bioset: mempool_destroy(md->tio_pool); bad3: mempool_destroy(md->io_pool); bad2: blk_cleanup_queue(md->queue); + bad1_free_minor: free_minor(minor); bad1: module_put(THIS_MODULE); @@ -971,6 +1030,7 @@ static void free_dev(struct mapped_device *md) } mempool_destroy(md->tio_pool); mempool_destroy(md->io_pool); + bioset_free(md->bs); del_gendisk(md->disk); free_minor(minor); @@ -1319,7 +1379,9 @@ int dm_resume(struct mapped_device *md) if (!map || !dm_table_get_size(map)) goto out; - dm_table_resume_targets(map); + r = dm_table_resume_targets(map); + if (r) + goto out; down_write(&md->io_lock); clear_bit(DMF_BLOCK_IO, &md->flags); @@ -1337,6 +1399,8 @@ int dm_resume(struct mapped_device *md) dm_table_unplug_all(map); + kobject_uevent(&md->disk->kobj, KOBJ_CHANGE); + r = 0; out: @@ -1377,6 +1441,7 @@ int dm_suspended(struct mapped_device *md) static struct 
block_device_operations dm_blk_dops = { .open = dm_blk_open, .release = dm_blk_close, + .ioctl = dm_blk_ioctl, .getgeo = dm_blk_getgeo, .owner = THIS_MODULE }; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 3c03c0ecab7..a48ec5e3c1f 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -21,6 +21,11 @@ #define DMERR(f, arg...) printk(KERN_ERR DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg) #define DMWARN(f, arg...) printk(KERN_WARNING DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg) #define DMINFO(f, arg...) printk(KERN_INFO DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg) +#ifdef CONFIG_DM_DEBUG +# define DMDEBUG(f, arg...) printk(KERN_DEBUG DM_NAME ": " DM_MSG_PREFIX " DEBUG: " f "\n", ## arg) +#else +# define DMDEBUG(f, arg...) do {} while (0) +#endif #define DMEMIT(x...) sz += ((sz >= maxlen) ? \ 0 : scnprintf(result + sz, maxlen - sz, x)) @@ -52,7 +57,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q); struct list_head *dm_table_get_devices(struct dm_table *t); void dm_table_presuspend_targets(struct dm_table *t); void dm_table_postsuspend_targets(struct dm_table *t); -void dm_table_resume_targets(struct dm_table *t); +int dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); void dm_table_unplug_all(struct dm_table *t); int dm_table_flush_all(struct dm_table *t); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index b99c19c7eb2..c625ddb8833 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -111,6 +111,19 @@ static int linear_issue_flush(request_queue_t *q, struct gendisk *disk, return ret; } +static int linear_congested(void *data, int bits) +{ + mddev_t *mddev = data; + linear_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + for (i = 0; i < mddev->raid_disks && !ret ; i++) { + request_queue_t *q = bdev_get_queue(conf->disks[i].rdev->bdev); + ret |= bdi_congested(&q->backing_dev_info, bits); + } + return ret; +} + static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) { linear_conf_t *conf; @@ -269,6 +282,8 @@ static int linear_run (mddev_t *mddev) blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->unplug_fn = linear_unplug; mddev->queue->issue_flush_fn = linear_issue_flush; + mddev->queue->backing_dev_info.congested_fn = linear_congested; + mddev->queue->backing_dev_info.congested_data = mddev; return 0; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 8dbab2ef388..38a0a5741d5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -389,8 +389,12 @@ static int super_written(struct bio *bio, unsigned int bytes_done, int error) if (bio->bi_size) return 1; - if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) + if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { + printk("md: super_written gets error=%d, uptodate=%d\n", + error, test_bit(BIO_UPTODATE, &bio->bi_flags)); + WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); md_error(mddev, rdev); + } if (atomic_dec_and_test(&mddev->pending_writes)) wake_up(&mddev->sb_wait); @@ -1587,7 +1591,7 @@ static void sync_sbs(mddev_t * mddev, int nospares) } } -void md_update_sb(mddev_t * mddev) +static void md_update_sb(mddev_t * mddev, int force_change) { int err; struct list_head *tmp; @@ -1598,7 +1602,18 @@ void md_update_sb(mddev_t * mddev) repeat: spin_lock_irq(&mddev->write_lock); - if (mddev->degraded && mddev->sb_dirty == 3) + set_bit(MD_CHANGE_PENDING, &mddev->flags); + if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) + force_change = 1; + if (test_and_clear_bit(MD_CHANGE_CLEAN, 
&mddev->flags)) + /* just a clean<-> dirty transition, possibly leave spares alone, + * though if events isn't the right even/odd, we will have to do + * spares after all + */ + nospares = 1; + if (force_change) + nospares = 0; + if (mddev->degraded) /* If the array is degraded, then skipping spares is both * dangerous and fairly pointless. * Dangerous because a device that was removed from the array @@ -1608,20 +1623,14 @@ repeat: * then a recovery will happen and soon that array won't * be degraded any more and the spare can go back to sleep then. */ - mddev->sb_dirty = 1; + nospares = 0; sync_req = mddev->in_sync; mddev->utime = get_seconds(); - if (mddev->sb_dirty == 3) - /* just a clean<-> dirty transition, possibly leave spares alone, - * though if events isn't the right even/odd, we will have to do - * spares after all - */ - nospares = 1; /* If this is just a dirty<->clean transition, and the array is clean * and 'events' is odd, we can roll back to the previous clean state */ - if (mddev->sb_dirty == 3 + if (nospares && (mddev->in_sync && mddev->recovery_cp == MaxSector) && (mddev->events & 1)) mddev->events--; @@ -1652,7 +1661,6 @@ repeat: MD_BUG(); mddev->events --; } - mddev->sb_dirty = 2; sync_sbs(mddev, nospares); /* @@ -1660,7 +1668,7 @@ repeat: * nonpersistent superblocks */ if (!mddev->persistent) { - mddev->sb_dirty = 0; + clear_bit(MD_CHANGE_PENDING, &mddev->flags); spin_unlock_irq(&mddev->write_lock); wake_up(&mddev->sb_wait); return; @@ -1697,20 +1705,20 @@ repeat: break; } md_super_wait(mddev); - /* if there was a failure, sb_dirty was set to 1, and we re-write super */ + /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ spin_lock_irq(&mddev->write_lock); - if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) { + if (mddev->in_sync != sync_req || + test_bit(MD_CHANGE_DEVS, &mddev->flags)) { /* have to write it out again */ spin_unlock_irq(&mddev->write_lock); goto repeat; } - mddev->sb_dirty = 0; + clear_bit(MD_CHANGE_PENDING, &mddev->flags); spin_unlock_irq(&mddev->write_lock); wake_up(&mddev->sb_wait); } -EXPORT_SYMBOL_GPL(md_update_sb); /* words written to sysfs files may, or my not, be \n terminated. * We want to accept with case. For this we use cmd_match. @@ -1783,7 +1791,7 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) else { mddev_t *mddev = rdev->mddev; kick_rdev_from_array(rdev); - md_update_sb(mddev); + md_update_sb(mddev, 1); md_new_event(mddev); err = 0; } @@ -2426,7 +2434,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) spin_lock_irq(&mddev->write_lock); if (atomic_read(&mddev->writes_pending) == 0) { mddev->in_sync = 1; - mddev->sb_dirty = 1; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); } spin_unlock_irq(&mddev->write_lock); } else { @@ -2438,7 +2446,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) case active: if (mddev->pers) { restart_array(mddev); - mddev->sb_dirty = 0; + clear_bit(MD_CHANGE_CLEAN, &mddev->flags); wake_up(&mddev->sb_wait); err = 0; } else { @@ -2520,6 +2528,36 @@ static struct md_sysfs_entry md_new_device = __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); static ssize_t +bitmap_store(mddev_t *mddev, const char *buf, size_t len) +{ + char *end; + unsigned long chunk, end_chunk; + + if (!mddev->bitmap) + goto out; + /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... 
(range) */ + while (*buf) { + chunk = end_chunk = simple_strtoul(buf, &end, 0); + if (buf == end) break; + if (*end == '-') { /* range */ + buf = end + 1; + end_chunk = simple_strtoul(buf, &end, 0); + if (buf == end) break; + } + if (*end && !isspace(*end)) break; + bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); + buf = end; + while (isspace(*buf)) buf++; + } + bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ +out: + return len; +} + +static struct md_sysfs_entry md_bitmap = +__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); + +static ssize_t size_show(mddev_t *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->size); @@ -2543,7 +2581,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len) if (mddev->pers) { err = update_size(mddev, size); - md_update_sb(mddev); + md_update_sb(mddev, 1); } else { if (mddev->size == 0 || mddev->size > size) @@ -2839,6 +2877,7 @@ static struct attribute *md_redundancy_attrs[] = { &md_sync_completed.attr, &md_suspend_lo.attr, &md_suspend_hi.attr, + &md_bitmap.attr, NULL, }; static struct attribute_group md_redundancy_group = { @@ -3111,8 +3150,8 @@ static int do_md_run(mddev_t * mddev) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - if (mddev->sb_dirty) - md_update_sb(mddev); + if (mddev->flags) + md_update_sb(mddev, 0); set_capacity(disk, mddev->array_size<<1); @@ -3275,10 +3314,10 @@ static int do_md_stop(mddev_t * mddev, int mode) if (mddev->ro) mddev->ro = 0; } - if (!mddev->in_sync || mddev->sb_dirty) { + if (!mddev->in_sync || mddev->flags) { /* mark array as shutdown cleanly */ mddev->in_sync = 1; - md_update_sb(mddev); + md_update_sb(mddev, 1); } if (mode == 1) set_disk_ro(disk, 1); @@ -3374,6 +3413,7 @@ static void autorun_devices(int part) printk(KERN_INFO "md: autorun ...\n"); while (!list_empty(&pending_raid_disks)) { + int unit; dev_t dev; LIST_HEAD(candidates); rdev0 = list_entry(pending_raid_disks.next, @@ -3393,16 +3433,19 @@ static void autorun_devices(int part) * mostly sane superblocks. It's time to allocate the * mddev. */ - if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) { + if (part) { + dev = MKDEV(mdp_major, + rdev0->preferred_minor << MdpMinorShift); + unit = MINOR(dev) >> MdpMinorShift; + } else { + dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); + unit = MINOR(dev); + } + if (rdev0->preferred_minor != unit) { printk(KERN_INFO "md: unit number in %s is bad: %d\n", bdevname(rdev0->bdev, b), rdev0->preferred_minor); break; } - if (part) - dev = MKDEV(mdp_major, - rdev0->preferred_minor << MdpMinorShift); - else - dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); md_probe(dev, NULL, NULL); mddev = mddev_find(dev); @@ -3440,67 +3483,6 @@ static void autorun_devices(int part) printk(KERN_INFO "md: ... autorun DONE.\n"); } -/* - * import RAID devices based on one partition - * if possible, the array gets run as well. 
- */ - -static int autostart_array(dev_t startdev) -{ - char b[BDEVNAME_SIZE]; - int err = -EINVAL, i; - mdp_super_t *sb = NULL; - mdk_rdev_t *start_rdev = NULL, *rdev; - - start_rdev = md_import_device(startdev, 0, 0); - if (IS_ERR(start_rdev)) - return err; - - - /* NOTE: this can only work for 0.90.0 superblocks */ - sb = (mdp_super_t*)page_address(start_rdev->sb_page); - if (sb->major_version != 0 || - sb->minor_version != 90 ) { - printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n"); - export_rdev(start_rdev); - return err; - } - - if (test_bit(Faulty, &start_rdev->flags)) { - printk(KERN_WARNING - "md: can not autostart based on faulty %s!\n", - bdevname(start_rdev->bdev,b)); - export_rdev(start_rdev); - return err; - } - list_add(&start_rdev->same_set, &pending_raid_disks); - - for (i = 0; i < MD_SB_DISKS; i++) { - mdp_disk_t *desc = sb->disks + i; - dev_t dev = MKDEV(desc->major, desc->minor); - - if (!dev) - continue; - if (dev == startdev) - continue; - if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) - continue; - rdev = md_import_device(dev, 0, 0); - if (IS_ERR(rdev)) - continue; - - list_add(&rdev->same_set, &pending_raid_disks); - } - - /* - * possibly return codes - */ - autorun_devices(0); - return 0; - -} - - static int get_version(void __user * arg) { mdu_version_t ver; @@ -3808,7 +3790,7 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev) goto busy; kick_rdev_from_array(rdev); - md_update_sb(mddev); + md_update_sb(mddev, 1); md_new_event(mddev); return 0; @@ -3885,7 +3867,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) rdev->raid_disk = -1; - md_update_sb(mddev); + md_update_sb(mddev, 1); /* * Kick recovery, maybe this spare has to be added to the @@ -4016,7 +3998,8 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->max_disks = MD_SB_DISKS; - mddev->sb_dirty = 1; + mddev->flags = 0; + set_bit(MD_CHANGE_DEVS, &mddev->flags); mddev->default_bitmap_offset = MD_SB_BYTES >> 9; mddev->bitmap_offset = 0; @@ -4185,7 +4168,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) mddev->bitmap_offset = 0; } } - md_update_sb(mddev); + md_update_sb(mddev, 1); return rv; } @@ -4259,27 +4242,6 @@ static int md_ioctl(struct inode *inode, struct file *file, goto abort; } - - if (cmd == START_ARRAY) { - /* START_ARRAY doesn't need to lock the array as autostart_array - * does the locking, and it could even be a different array - */ - static int cnt = 3; - if (cnt > 0 ) { - printk(KERN_WARNING - "md: %s(pid %d) used deprecated START_ARRAY ioctl. " - "This will not be supported beyond July 2006\n", - current->comm, current->pid); - cnt--; - } - err = autostart_array(new_decode_dev(arg)); - if (err) { - printk(KERN_WARNING "md: autostart failed!\n"); - goto abort; - } - goto done; - } - err = mddev_lock(mddev); if (err) { printk(KERN_INFO @@ -4687,9 +4649,11 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? "reshape" : - (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? - "resync" : "recovery")), - per_milli/10, per_milli % 10, + (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? + "check" : + (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 
+ "resync" : "recovery"))), + per_milli/10, per_milli % 10, (unsigned long long) resync, (unsigned long long) max_blocks); @@ -5042,12 +5006,12 @@ void md_write_start(mddev_t *mddev, struct bio *bi) spin_lock_irq(&mddev->write_lock); if (mddev->in_sync) { mddev->in_sync = 0; - mddev->sb_dirty = 3; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); md_wakeup_thread(mddev->thread); } spin_unlock_irq(&mddev->write_lock); } - wait_event(mddev->sb_wait, mddev->sb_dirty==0); + wait_event(mddev->sb_wait, mddev->flags==0); } void md_write_end(mddev_t *mddev) @@ -5078,6 +5042,7 @@ void md_do_sync(mddev_t *mddev) int skipped = 0; struct list_head *rtmp; mdk_rdev_t *rdev; + char *desc; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) @@ -5085,6 +5050,18 @@ void md_do_sync(mddev_t *mddev) if (mddev->ro) /* never try to sync a read-only array */ return; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + desc = "data-check"; + else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + desc = "requested-resync"; + else + desc = "resync"; + } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + desc = "reshape"; + else + desc = "recovery"; + /* we overload curr_resync somewhat here. * 0 == not engaged in resync at all * 2 == checking that there is no conflict with another sync @@ -5128,10 +5105,10 @@ void md_do_sync(mddev_t *mddev) prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE); if (!kthread_should_stop() && mddev2->curr_resync >= mddev->curr_resync) { - printk(KERN_INFO "md: delaying resync of %s" - " until %s has finished resync (they" + printk(KERN_INFO "md: delaying %s of %s" + " until %s has finished (they" " share one or more physical units)\n", - mdname(mddev), mdname(mddev2)); + desc, mdname(mddev), mdname(mddev2)); mddev_put(mddev2); schedule(); finish_wait(&resync_wait, &wq); @@ -5167,12 +5144,12 @@ void md_do_sync(mddev_t *mddev) j = rdev->recovery_offset; } - printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); - printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" - " %d KB/sec/disc.\n", speed_min(mddev)); + printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); + printk(KERN_INFO "md: minimum _guaranteed_ speed:" + " %d KB/sec/disk.\n", speed_min(mddev)); printk(KERN_INFO "md: using maximum available idle IO bandwidth " - "(but not more than %d KB/sec) for reconstruction.\n", - speed_max(mddev)); + "(but not more than %d KB/sec) for %s.\n", + speed_max(mddev), desc); is_mddev_idle(mddev); /* this also initializes IO event counters */ @@ -5198,8 +5175,8 @@ void md_do_sync(mddev_t *mddev) if (j>2) { printk(KERN_INFO - "md: resuming recovery of %s from checkpoint.\n", - mdname(mddev)); + "md: resuming %s of %s from checkpoint.\n", + desc, mdname(mddev)); mddev->curr_resync = j; } @@ -5282,7 +5259,7 @@ void md_do_sync(mddev_t *mddev) } } } - printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev)); + printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); /* * this also signals 'finished resyncing' to md_stop */ @@ -5302,8 +5279,8 @@ void md_do_sync(mddev_t *mddev) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { if (mddev->curr_resync >= mddev->recovery_cp) { printk(KERN_INFO - "md: checkpointing recovery of %s.\n", - mdname(mddev)); + "md: checkpointing %s of %s.\n", + desc, mdname(mddev)); mddev->recovery_cp = mddev->curr_resync; } } else @@ -5317,7 +5294,6 @@ void md_do_sync(mddev_t *mddev) !test_bit(In_sync, &rdev->flags) && 
rdev->recovery_offset < mddev->curr_resync) rdev->recovery_offset = mddev->curr_resync; - mddev->sb_dirty = 1; } } @@ -5374,7 +5350,7 @@ void md_check_recovery(mddev_t *mddev) } if ( ! ( - mddev->sb_dirty || + mddev->flags || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || (mddev->safemode == 1) || @@ -5390,14 +5366,14 @@ void md_check_recovery(mddev_t *mddev) if (mddev->safemode && !atomic_read(&mddev->writes_pending) && !mddev->in_sync && mddev->recovery_cp == MaxSector) { mddev->in_sync = 1; - mddev->sb_dirty = 3; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); } if (mddev->safemode == 1) mddev->safemode = 0; spin_unlock_irq(&mddev->write_lock); - if (mddev->sb_dirty) - md_update_sb(mddev); + if (mddev->flags) + md_update_sb(mddev, 0); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && @@ -5416,7 +5392,7 @@ void md_check_recovery(mddev_t *mddev) /* activate any spares */ mddev->pers->spare_active(mddev); } - md_update_sb(mddev); + md_update_sb(mddev, 1); /* if array is no-longer degraded, then any saved_raid_disk * information must be scrapped @@ -5556,22 +5532,15 @@ static void md_geninit(void) static int __init md_init(void) { - printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," - " MD_SB_DISKS=%d\n", - MD_MAJOR_VERSION, MD_MINOR_VERSION, - MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); - printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI, - BITMAP_MINOR); - if (register_blkdev(MAJOR_NR, "md")) return -1; if ((mdp_major=register_blkdev(0, "mdp"))<=0) { unregister_blkdev(MAJOR_NR, "md"); return -1; } - blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, - md_probe, NULL, NULL); - blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE, + blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE, + md_probe, NULL, NULL); + blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, md_probe, NULL, NULL); register_reboot_notifier(&md_notifier); @@ -5630,8 +5599,8 @@ static __exit void md_exit(void) mddev_t *mddev; struct list_head *tmp; - blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); - blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift); + blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS); + blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); unregister_blkdev(MAJOR_NR,"md"); unregister_blkdev(mdp_major, "mdp"); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 1cc9de44ce8..171ff41b52b 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -228,6 +228,28 @@ static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk, rcu_read_unlock(); return ret; } +static int multipath_congested(void *data, int bits) +{ + mddev_t *mddev = data; + multipath_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + rcu_read_lock(); + for (i = 0; i < mddev->raid_disks ; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + request_queue_t *q = bdev_get_queue(rdev->bdev); + + ret |= bdi_congested(&q->backing_dev_info, bits); + /* Just like multipath_map, we just check the + * first available device + */ + break; + } + } + rcu_read_unlock(); + return ret; +} /* * Careful, this can execute in IRQ contexts as well! 
@@ -253,7 +275,7 @@ static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) char b[BDEVNAME_SIZE]; clear_bit(In_sync, &rdev->flags); set_bit(Faulty, &rdev->flags); - mddev->sb_dirty = 1; + set_bit(MD_CHANGE_DEVS, &mddev->flags); conf->working_disks--; printk(KERN_ALERT "multipath: IO failure on %s," " disabling IO path. \n Operation continuing" @@ -470,7 +492,6 @@ static int multipath_run (mddev_t *mddev) } conf->raid_disks = mddev->raid_disks; - mddev->sb_dirty = 1; conf->mddev = mddev; spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); @@ -510,6 +531,8 @@ static int multipath_run (mddev_t *mddev) mddev->queue->unplug_fn = multipath_unplug; mddev->queue->issue_flush_fn = multipath_issue_flush; + mddev->queue->backing_dev_info.congested_fn = multipath_congested; + mddev->queue->backing_dev_info.congested_data = mddev; return 0; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index cb8c6317e4e..dfe32149ad3 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -60,6 +60,21 @@ static int raid0_issue_flush(request_queue_t *q, struct gendisk *disk, return ret; } +static int raid0_congested(void *data, int bits) +{ + mddev_t *mddev = data; + raid0_conf_t *conf = mddev_to_conf(mddev); + mdk_rdev_t **devlist = conf->strip_zone[0].dev; + int i, ret = 0; + + for (i = 0; i < mddev->raid_disks && !ret ; i++) { + request_queue_t *q = bdev_get_queue(devlist[i]->bdev); + + ret |= bdi_congested(&q->backing_dev_info, bits); + } + return ret; +} + static int create_strip_zones (mddev_t *mddev) { @@ -236,6 +251,8 @@ static int create_strip_zones (mddev_t *mddev) mddev->queue->unplug_fn = raid0_unplug; mddev->queue->issue_flush_fn = raid0_issue_flush; + mddev->queue->backing_dev_info.congested_fn = raid0_congested; + mddev->queue->backing_dev_info.congested_data = mddev; printk("raid0: done.\n"); return 0; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3b4d69c0562..dc9d2def027 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -271,7 +271,7 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int */ update_head_pos(mirror, r1_bio); - if (uptodate || conf->working_disks <= 1) { + if (uptodate || (conf->raid_disks - conf->mddev->degraded) <= 1) { /* * Set R1BIO_Uptodate in our master bio, so that * we will return a good error code for to the higher @@ -601,6 +601,32 @@ static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, return ret; } +static int raid1_congested(void *data, int bits) +{ + mddev_t *mddev = data; + conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + rcu_read_lock(); + for (i = 0; i < mddev->raid_disks; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + request_queue_t *q = bdev_get_queue(rdev->bdev); + + /* Note the '|| 1' - when read_balance prefers + * non-congested targets, it can be removed + */ + if ((bits & (1<<BDI_write_congested)) || 1) + ret |= bdi_congested(&q->backing_dev_info, bits); + else + ret &= bdi_congested(&q->backing_dev_info, bits); + } + } + rcu_read_unlock(); + return ret; +} + + /* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. 
@@ -929,7 +955,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) int i; seq_printf(seq, " [%d/%d] [", conf->raid_disks, - conf->working_disks); + conf->raid_disks - mddev->degraded); rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); @@ -953,26 +979,27 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * else mark the drive as failed */ if (test_bit(In_sync, &rdev->flags) - && conf->working_disks == 1) + && (conf->raid_disks - mddev->degraded) == 1) /* * Don't fail the drive, act as though we were just a * normal single drive */ return; - if (test_bit(In_sync, &rdev->flags)) { + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; - conf->working_disks--; + spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery is running, make sure it aborts. */ set_bit(MD_RECOVERY_ERR, &mddev->recovery); } - clear_bit(In_sync, &rdev->flags); set_bit(Faulty, &rdev->flags); - mddev->sb_dirty = 1; + set_bit(MD_CHANGE_DEVS, &mddev->flags); printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n" " Operation continuing on %d devices\n", - bdevname(rdev->bdev,b), conf->working_disks); + bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); } static void print_conf(conf_t *conf) @@ -984,7 +1011,7 @@ static void print_conf(conf_t *conf) printk("(!conf)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->working_disks, + printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); rcu_read_lock(); @@ -1023,10 +1050,11 @@ static int raid1_spare_active(mddev_t *mddev) mdk_rdev_t *rdev = conf->mirrors[i].rdev; if (rdev && !test_bit(Faulty, &rdev->flags) - && !test_bit(In_sync, &rdev->flags)) { - conf->working_disks++; + && !test_and_set_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded--; - set_bit(In_sync, &rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); } } @@ -1368,6 +1396,95 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) * 3. Performs writes following reads for array syncronising. */ +static void fix_read_error(conf_t *conf, int read_disk, + sector_t sect, int sectors) +{ + mddev_t *mddev = conf->mddev; + while(sectors) { + int s = sectors; + int d = read_disk; + int success = 0; + int start; + mdk_rdev_t *rdev; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + + do { + /* Note: no rcu protection needed here + * as this is synchronous in the raid1d thread + * which is the thread that might remove + * a device. If raid1d ever becomes multi-threaded.... 
+ */ + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags) && + sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, + conf->tmppage, READ)) + success = 1; + else { + d++; + if (d == conf->raid_disks) + d = 0; + } + } while (!success && d != read_disk); + + if (!success) { + /* Cannot read from anywhere -- bye bye array */ + md_error(mddev, conf->mirrors[read_disk].rdev); + break; + } + /* write it back and re-read */ + start = d; + while (d != read_disk) { + if (d==0) + d = conf->raid_disks; + d--; + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags)) { + if (sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, conf->tmppage, WRITE) + == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + } + } + d = start; + while (d != read_disk) { + char b[BDEVNAME_SIZE]; + if (d==0) + d = conf->raid_disks; + d--; + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags)) { + if (sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, conf->tmppage, READ) + == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + else { + atomic_add(s, &rdev->corrected_errors); + printk(KERN_INFO + "raid1:%s: read error corrected " + "(%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)sect + + rdev->data_offset, + bdevname(rdev->bdev, b)); + } + } + } + sectors -= s; + sect += s; + } +} + static void raid1d(mddev_t *mddev) { r1bio_t *r1_bio; @@ -1460,86 +1577,14 @@ static void raid1d(mddev_t *mddev) * This is all done synchronously while the array is * frozen */ - sector_t sect = r1_bio->sector; - int sectors = r1_bio->sectors; - freeze_array(conf); - if (mddev->ro == 0) while(sectors) { - int s = sectors; - int d = r1_bio->read_disk; - int success = 0; - - if (s > (PAGE_SIZE>>9)) - s = PAGE_SIZE >> 9; - - do { - /* Note: no rcu protection needed here - * as this is synchronous in the raid1d thread - * which is the thread that might remove - * a device. If raid1d ever becomes multi-threaded.... 
- */ - rdev = conf->mirrors[d].rdev; - if (rdev && - test_bit(In_sync, &rdev->flags) && - sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, - conf->tmppage, READ)) - success = 1; - else { - d++; - if (d == conf->raid_disks) - d = 0; - } - } while (!success && d != r1_bio->read_disk); - - if (success) { - /* write it back and re-read */ - int start = d; - while (d != r1_bio->read_disk) { - if (d==0) - d = conf->raid_disks; - d--; - rdev = conf->mirrors[d].rdev; - if (rdev && - test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - } - } - d = start; - while (d != r1_bio->read_disk) { - if (d==0) - d = conf->raid_disks; - d--; - rdev = conf->mirrors[d].rdev; - if (rdev && - test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, READ) == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - else { - atomic_add(s, &rdev->corrected_errors); - printk(KERN_INFO "raid1:%s: read error corrected (%d sectors at %llu on %s)\n", - mdname(mddev), s, (unsigned long long)(sect + rdev->data_offset), bdevname(rdev->bdev, b)); - } - } - } - } else { - /* Cannot read from anywhere -- bye bye array */ - md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); - break; - } - sectors -= s; - sect += s; + if (mddev->ro == 0) { + freeze_array(conf); + fix_read_error(conf, r1_bio->read_disk, + r1_bio->sector, + r1_bio->sectors); + unfreeze_array(conf); } - unfreeze_array(conf); - bio = r1_bio->bios[r1_bio->read_disk]; if ((disk=read_balance(conf, r1_bio)) == -1) { printk(KERN_ALERT "raid1: %s: unrecoverable I/O" @@ -1884,15 +1929,11 @@ static int run(mddev_t *mddev) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->head_position = 0; - if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags)) - conf->working_disks++; } conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); - if (conf->working_disks == 1) - mddev->recovery_cp = MaxSector; spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); @@ -1900,11 +1941,6 @@ static int run(mddev_t *mddev) bio_list_init(&conf->pending_bio_list); bio_list_init(&conf->flushing_bio_list); - if (!conf->working_disks) { - printk(KERN_ERR "raid1: no operational mirrors for %s\n", - mdname(mddev)); - goto out_free_conf; - } mddev->degraded = 0; for (i = 0; i < conf->raid_disks; i++) { @@ -1917,6 +1953,13 @@ static int run(mddev_t *mddev) mddev->degraded++; } } + if (mddev->degraded == conf->raid_disks) { + printk(KERN_ERR "raid1: no operational mirrors for %s\n", + mdname(mddev)); + goto out_free_conf; + } + if (conf->raid_disks - mddev->degraded == 1) + mddev->recovery_cp = MaxSector; /* * find the first working one and use it as a starting point @@ -1948,6 +1991,8 @@ static int run(mddev_t *mddev) mddev->queue->unplug_fn = raid1_unplug; mddev->queue->issue_flush_fn = raid1_issue_flush; + mddev->queue->backing_dev_info.congested_fn = raid1_congested; + mddev->queue->backing_dev_info.congested_data = mddev; return 0; @@ -2035,7 +2080,7 @@ static int raid1_reshape(mddev_t *mddev) mirror_info_t *newmirrors; conf_t *conf = mddev_to_conf(mddev); int cnt, raid_disks; - + unsigned long flags; int d, d2; /* Cannot change chunk_size, layout, or level */ @@ -2094,7 +2139,9 @@ static int raid1_reshape(mddev_t *mddev) kfree(conf->poolinfo); conf->poolinfo = 
newpoolinfo; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded += (raid_disks - conf->raid_disks); + spin_unlock_irqrestore(&conf->device_lock, flags); conf->raid_disks = mddev->raid_disks = raid_disks; mddev->delta_disks = 0; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 016ddb831c9..1250f0eab4a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -648,6 +648,26 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, return ret; } +static int raid10_congested(void *data, int bits) +{ + mddev_t *mddev = data; + conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + rcu_read_lock(); + for (i = 0; i < mddev->raid_disks && ret == 0; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + request_queue_t *q = bdev_get_queue(rdev->bdev); + + ret |= bdi_congested(&q->backing_dev_info, bits); + } + } + rcu_read_unlock(); + return ret; +} + + /* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. @@ -921,7 +941,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) seq_printf(seq, " %d far-copies", conf->far_copies); } seq_printf(seq, " [%d/%d] [", conf->raid_disks, - conf->working_disks); + conf->raid_disks - mddev->degraded); for (i = 0; i < conf->raid_disks; i++) seq_printf(seq, "%s", conf->mirrors[i].rdev && @@ -941,7 +961,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * else mark the drive as failed */ if (test_bit(In_sync, &rdev->flags) - && conf->working_disks == 1) + && conf->raid_disks-mddev->degraded == 1) /* * Don't fail the drive, just return an IO error. * The test should really be more sophisticated than @@ -950,20 +970,21 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * really dead" tests... */ return; - if (test_bit(In_sync, &rdev->flags)) { + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; - conf->working_disks--; + spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery is running, make sure it aborts. */ set_bit(MD_RECOVERY_ERR, &mddev->recovery); } - clear_bit(In_sync, &rdev->flags); set_bit(Faulty, &rdev->flags); - mddev->sb_dirty = 1; + set_bit(MD_CHANGE_DEVS, &mddev->flags); printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n" " Operation continuing on %d devices\n", - bdevname(rdev->bdev,b), conf->working_disks); + bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); } static void print_conf(conf_t *conf) @@ -976,7 +997,7 @@ static void print_conf(conf_t *conf) printk("(!conf)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->working_disks, + printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); for (i = 0; i < conf->raid_disks; i++) { @@ -1034,10 +1055,11 @@ static int raid10_spare_active(mddev_t *mddev) tmp = conf->mirrors + i; if (tmp->rdev && !test_bit(Faulty, &tmp->rdev->flags) - && !test_bit(In_sync, &tmp->rdev->flags)) { - conf->working_disks++; + && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded--; - set_bit(In_sync, &tmp->rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); } } @@ -1350,9 +1372,119 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) * * 1. Retries failed read operations on working mirrors. * 2. 
Updates the raid superblock when problems encounter. - * 3. Performs writes following reads for array syncronising. + * 3. Performs writes following reads for array synchronising. */ +static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) +{ + int sect = 0; /* Offset from r10_bio->sector */ + int sectors = r10_bio->sectors; + mdk_rdev_t*rdev; + while(sectors) { + int s = sectors; + int sl = r10_bio->read_slot; + int success = 0; + int start; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + + rcu_read_lock(); + do { + int d = r10_bio->devs[sl].devnum; + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev && + test_bit(In_sync, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + success = sync_page_io(rdev->bdev, + r10_bio->devs[sl].addr + + sect + rdev->data_offset, + s<<9, + conf->tmppage, READ); + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + if (success) + break; + } + sl++; + if (sl == conf->copies) + sl = 0; + } while (!success && sl != r10_bio->read_slot); + rcu_read_unlock(); + + if (!success) { + /* Cannot read from anywhere -- bye bye array */ + int dn = r10_bio->devs[r10_bio->read_slot].devnum; + md_error(mddev, conf->mirrors[dn].rdev); + break; + } + + start = sl; + /* write it back and re-read */ + rcu_read_lock(); + while (sl != r10_bio->read_slot) { + int d; + if (sl==0) + sl = conf->copies; + sl--; + d = r10_bio->devs[sl].devnum; + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev && + test_bit(In_sync, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + atomic_add(s, &rdev->corrected_errors); + if (sync_page_io(rdev->bdev, + r10_bio->devs[sl].addr + + sect + rdev->data_offset, + s<<9, conf->tmppage, WRITE) + == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + } + } + sl = start; + while (sl != r10_bio->read_slot) { + int d; + if (sl==0) + sl = conf->copies; + sl--; + d = r10_bio->devs[sl].devnum; + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev && + test_bit(In_sync, &rdev->flags)) { + char b[BDEVNAME_SIZE]; + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + if (sync_page_io(rdev->bdev, + r10_bio->devs[sl].addr + + sect + rdev->data_offset, + s<<9, conf->tmppage, READ) == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + else + printk(KERN_INFO + "raid10:%s: read error corrected" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)sect+ + rdev->data_offset, + bdevname(rdev->bdev, b)); + + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + } + } + rcu_read_unlock(); + + sectors -= s; + sect += s; + } +} + static void raid10d(mddev_t *mddev) { r10bio_t *r10_bio; @@ -1413,105 +1545,12 @@ static void raid10d(mddev_t *mddev) * This is all done synchronously while the array is * frozen. 
*/ - int sect = 0; /* Offset from r10_bio->sector */ - int sectors = r10_bio->sectors; - freeze_array(conf); - if (mddev->ro == 0) while(sectors) { - int s = sectors; - int sl = r10_bio->read_slot; - int success = 0; - - if (s > (PAGE_SIZE>>9)) - s = PAGE_SIZE >> 9; - - rcu_read_lock(); - do { - int d = r10_bio->devs[sl].devnum; - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - test_bit(In_sync, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - success = sync_page_io(rdev->bdev, - r10_bio->devs[sl].addr + - sect + rdev->data_offset, - s<<9, - conf->tmppage, READ); - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - if (success) - break; - } - sl++; - if (sl == conf->copies) - sl = 0; - } while (!success && sl != r10_bio->read_slot); - rcu_read_unlock(); - - if (success) { - int start = sl; - /* write it back and re-read */ - rcu_read_lock(); - while (sl != r10_bio->read_slot) { - int d; - if (sl==0) - sl = conf->copies; - sl--; - d = r10_bio->devs[sl].devnum; - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - test_bit(In_sync, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - atomic_add(s, &rdev->corrected_errors); - if (sync_page_io(rdev->bdev, - r10_bio->devs[sl].addr + - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - sl = start; - while (sl != r10_bio->read_slot) { - int d; - if (sl==0) - sl = conf->copies; - sl--; - d = r10_bio->devs[sl].devnum; - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - test_bit(In_sync, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - if (sync_page_io(rdev->bdev, - r10_bio->devs[sl].addr + - sect + rdev->data_offset, - s<<9, conf->tmppage, READ) == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - else - printk(KERN_INFO "raid10:%s: read error corrected (%d sectors at %llu on %s)\n", - mdname(mddev), s, (unsigned long long)(sect+rdev->data_offset), bdevname(rdev->bdev, b)); - - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - rcu_read_unlock(); - } else { - /* Cannot read from anywhere -- bye bye array */ - md_error(mddev, conf->mirrors[r10_bio->devs[r10_bio->read_slot].devnum].rdev); - break; - } - sectors -= s; - sect += s; + if (mddev->ro == 0) { + freeze_array(conf); + fix_read_error(conf, mddev, r10_bio); + unfreeze_array(conf); } - unfreeze_array(conf); - bio = r10_bio->devs[r10_bio->read_slot].bio; r10_bio->devs[r10_bio->read_slot].bio = mddev->ro ? IO_BLOCKED : NULL; @@ -2018,8 +2057,6 @@ static int run(mddev_t *mddev) mddev->queue->max_sectors = (PAGE_SIZE>>9); disk->head_position = 0; - if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags)) - conf->working_disks++; } conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; @@ -2077,6 +2114,8 @@ static int run(mddev_t *mddev) mddev->queue->unplug_fn = raid10_unplug; mddev->queue->issue_flush_fn = raid10_issue_flush; + mddev->queue->backing_dev_info.congested_fn = raid10_congested; + mddev->queue->backing_dev_info.congested_data = mddev; /* Calculate max read-ahead size. * We need to readahead at least twice a whole stripe.... 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 45006600716..37e4ff661b6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -636,7 +636,6 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done, struct stripe_head *sh = bi->bi_private; raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks, i; - unsigned long flags; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); if (bi->bi_size) @@ -654,7 +653,6 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done, return 0; } - spin_lock_irqsave(&conf->device_lock, flags); if (!uptodate) md_error(conf->mddev, conf->disks[i].rdev); @@ -662,8 +660,7 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done, clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); - __release_stripe(conf, sh); - spin_unlock_irqrestore(&conf->device_lock, flags); + release_stripe(sh); return 0; } @@ -696,12 +693,12 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) PRINTK("raid5: error called\n"); if (!test_bit(Faulty, &rdev->flags)) { - mddev->sb_dirty = 1; - if (test_bit(In_sync, &rdev->flags)) { - conf->working_disks--; + set_bit(MD_CHANGE_DEVS, &mddev->flags); + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; - conf->failed_disks++; - clear_bit(In_sync, &rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery was running, make sure it aborts. */ @@ -711,7 +708,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) printk (KERN_ALERT "raid5: Disk failure on %s, disabling device." " Operation continuing on %d devices\n", - bdevname(rdev->bdev,b), conf->working_disks); + bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); } } @@ -1353,10 +1350,9 @@ static int page_is_zero(struct page *p) static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) { int sectors_per_chunk = conf->chunk_size >> 9; - sector_t x = stripe; int pd_idx, dd_idx; - int chunk_offset = sector_div(x, sectors_per_chunk); - stripe = x; + int chunk_offset = sector_div(stripe, sectors_per_chunk); + raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk + chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf); return pd_idx; @@ -2597,6 +2593,24 @@ static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk, return ret; } +static int raid5_congested(void *data, int bits) +{ + mddev_t *mddev = data; + raid5_conf_t *conf = mddev_to_conf(mddev); + + /* No difference between reads and writes. 
Just check + * how busy the stripe_cache is + */ + if (conf->inactive_blocked) + return 1; + if (conf->quiesce) + return 1; + if (list_empty_careful(&conf->inactive_list)) + return 1; + + return 0; +} + static int make_request(request_queue_t *q, struct bio * bi) { mddev_t *mddev = q->queuedata; @@ -2781,9 +2795,9 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes)==0); mddev->reshape_position = conf->expand_progress; - mddev->sb_dirty = 1; + set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); - wait_event(mddev->sb_wait, mddev->sb_dirty == 0 || + wait_event(mddev->sb_wait, mddev->flags == 0 || kthread_should_stop()); spin_lock_irq(&conf->device_lock); conf->expand_lo = mddev->reshape_position; @@ -3074,6 +3088,7 @@ static int run(mddev_t *mddev) mdk_rdev_t *rdev; struct disk_info *disk; struct list_head *tmp; + int working_disks = 0; if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) { printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", @@ -3176,14 +3191,14 @@ static int run(mddev_t *mddev) printk(KERN_INFO "raid5: device %s operational as raid" " disk %d\n", bdevname(rdev->bdev,b), raid_disk); - conf->working_disks++; + working_disks++; } } /* * 0 for a fully functional array, 1 or 2 for a degraded array. */ - mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks; + mddev->degraded = conf->raid_disks - working_disks; conf->mddev = mddev; conf->chunk_size = mddev->chunk_size; conf->level = mddev->level; @@ -3218,7 +3233,7 @@ static int run(mddev_t *mddev) if (mddev->degraded > conf->max_degraded) { printk(KERN_ERR "raid5: not enough operational devices for %s" " (%d/%d failed)\n", - mdname(mddev), conf->failed_disks, conf->raid_disks); + mdname(mddev), mddev->degraded, conf->raid_disks); goto abort; } @@ -3299,6 +3314,9 @@ static int run(mddev_t *mddev) mddev->queue->unplug_fn = raid5_unplug_device; mddev->queue->issue_flush_fn = raid5_issue_flush; + mddev->queue->backing_dev_info.congested_fn = raid5_congested; + mddev->queue->backing_dev_info.congested_data = mddev; + mddev->array_size = mddev->size * (conf->previous_raid_disks - conf->max_degraded); @@ -3375,7 +3393,7 @@ static void status (struct seq_file *seq, mddev_t *mddev) int i; seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); - seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks); + seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); for (i = 0; i < conf->raid_disks; i++) seq_printf (seq, "%s", conf->disks[i].rdev && @@ -3397,8 +3415,8 @@ static void print_raid5_conf (raid5_conf_t *conf) printk("(conf==NULL)\n"); return; } - printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks, - conf->working_disks, conf->failed_disks); + printk(" --- rd:%d wd:%d\n", conf->raid_disks, + conf->raid_disks - conf->mddev->degraded); for (i = 0; i < conf->raid_disks; i++) { char b[BDEVNAME_SIZE]; @@ -3420,11 +3438,11 @@ static int raid5_spare_active(mddev_t *mddev) tmp = conf->disks + i; if (tmp->rdev && !test_bit(Faulty, &tmp->rdev->flags) - && !test_bit(In_sync, &tmp->rdev->flags)) { + && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded--; - conf->failed_disks--; - conf->working_disks++; - set_bit(In_sync, &tmp->rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); } } 
print_raid5_conf(conf); @@ -3560,6 +3578,7 @@ static int raid5_start_reshape(mddev_t *mddev) struct list_head *rtmp; int spares = 0; int added_devices = 0; + unsigned long flags; if (mddev->degraded || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) @@ -3593,7 +3612,6 @@ static int raid5_start_reshape(mddev_t *mddev) if (raid5_add_disk(mddev, rdev)) { char nm[20]; set_bit(In_sync, &rdev->flags); - conf->working_disks++; added_devices++; rdev->recovery_offset = 0; sprintf(nm, "rd%d", rdev->raid_disk); @@ -3602,10 +3620,12 @@ static int raid5_start_reshape(mddev_t *mddev) break; } + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices; + spin_unlock_irqrestore(&conf->device_lock, flags); mddev->raid_disks = conf->raid_disks; mddev->reshape_position = 0; - mddev->sb_dirty = 1; + set_bit(MD_CHANGE_DEVS, &mddev->flags); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
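
Much of the md.c and raid*.c churn above replaces the old integer mddev->sb_dirty with a bitmask of change flags (MD_CHANGE_DEVS, MD_CHANGE_CLEAN, MD_CHANGE_PENDING) that md_update_sb() consumes. The sketch below is not part of the patch: it is a user-space illustration of that decision logic, with plain bit operations standing in for the kernel's atomic set_bit()/test_and_clear_bit(), and with illustrative bit numbers and struct names (fake_mddev, update_decides_nospares) that do not exist in the kernel.

/* Illustration only: mirrors the flag handling added to md_update_sb().
 * Plain bit ops are used so the sketch compiles and runs in user space. */
#include <stdio.h>

enum { MD_CHANGE_DEVS = 0, MD_CHANGE_CLEAN = 1, MD_CHANGE_PENDING = 2 };

struct fake_mddev {
	unsigned long flags;
	int degraded;
};

static int test_and_clear(unsigned long *flags, int bit)
{
	int was_set = (*flags >> bit) & 1;
	*flags &= ~(1UL << bit);
	return was_set;
}

/* Returns 1 if the superblock update may leave spare devices alone. */
static int update_decides_nospares(struct fake_mddev *mddev, int force_change)
{
	int nospares = 0;

	mddev->flags |= 1UL << MD_CHANGE_PENDING;	/* a write is now pending */
	if (test_and_clear(&mddev->flags, MD_CHANGE_DEVS))
		force_change = 1;			/* the device set changed */
	if (test_and_clear(&mddev->flags, MD_CHANGE_CLEAN))
		nospares = 1;				/* clean<->dirty transition only */
	if (force_change)
		nospares = 0;
	if (mddev->degraded)
		nospares = 0;				/* never skip spares when degraded */
	return nospares;
}

int main(void)
{
	struct fake_mddev m = { .flags = 1UL << MD_CHANGE_CLEAN, .degraded = 0 };

	printf("nospares = %d\n", update_decides_nospares(&m, 0)); /* prints 1 */
	return 0;
}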
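
Several hunks above also wire a congested_fn into each personality's backing_dev_info (linear_congested, multipath_congested, raid0_congested, raid1_congested, raid10_congested, raid5_congested) so writeback can ask whether the array would block. The following is a hypothetical user-space analogue of that pattern, ORing the congestion state of every member device as linear_congested() and raid0_congested() do; member, fake_array and the BDI_* constants here are stand-ins, not kernel APIs.

/* Illustration of the congested_fn pattern; array_congested() stands in
 * for the per-personality callback, and the bit test stands in for
 * bdi_congested() on each member device's request queue. */
#include <stdio.h>

#define BDI_WRITE_CONGESTED (1 << 0)
#define BDI_READ_CONGESTED  (1 << 1)

struct member { int congested_bits; };

struct fake_array {
	struct member *members;
	int nr_members;
};

/* The array is reported congested as soon as any member device reports
 * the requested congestion bits; the loop exits early once that happens. */
static int array_congested(struct fake_array *a, int bits)
{
	int i, ret = 0;

	for (i = 0; i < a->nr_members && !ret; i++)
		ret |= a->members[i].congested_bits & bits;
	return ret;
}

int main(void)
{
	struct member disks[3] = { {0}, {BDI_WRITE_CONGESTED}, {0} };
	struct fake_array a = { disks, 3 };

	printf("write congested: %d\n",
	       !!array_congested(&a, BDI_WRITE_CONGESTED)); /* prints 1 */
	return 0;
}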