Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig              |  21
-rw-r--r--  drivers/md/bitmap.c             |  17
-rw-r--r--  drivers/md/dm-crypt.c           | 506
-rw-r--r--  drivers/md/dm-emc.c             |   3
-rw-r--r--  drivers/md/dm-exception-store.c | 176
-rw-r--r--  drivers/md/dm-linear.c          |  19
-rw-r--r--  drivers/md/dm-mpath.c           |  83
-rw-r--r--  drivers/md/dm-raid1.c           |   4
-rw-r--r--  drivers/md/dm-snap.c            | 351
-rw-r--r--  drivers/md/dm-snap.h            |  17
-rw-r--r--  drivers/md/dm-table.c           | 109
-rw-r--r--  drivers/md/dm.c                 | 113
-rw-r--r--  drivers/md/dm.h                 |   7
-rw-r--r--  drivers/md/linear.c             |  15
-rw-r--r--  drivers/md/md.c                 | 275
-rw-r--r--  drivers/md/multipath.c          |  27
-rw-r--r--  drivers/md/raid0.c              |  17
-rw-r--r--  drivers/md/raid1.c              | 247
-rw-r--r--  drivers/md/raid10.c             | 261
-rw-r--r--  drivers/md/raid5.c              |  74
20 files changed, 1390 insertions(+), 952 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index bf869ed03ee..c92c1521546 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -2,6 +2,8 @@ # Block device driver configuration # +if BLOCK + menu "Multi-device support (RAID and LVM)" config MD @@ -136,16 +138,16 @@ config MD_RAID456 If unsure, say Y. config MD_RAID5_RESHAPE - bool "Support adding drives to a raid-5 array (experimental)" - depends on MD_RAID456 && EXPERIMENTAL + bool "Support adding drives to a raid-5 array" + depends on MD_RAID456 + default y ---help--- A RAID-5 set can be expanded by adding extra drives. This requires "restriping" the array which means (almost) every block must be written to a different place. This option allows such restriping to be done while the array - is online. However it is still EXPERIMENTAL code. It should - work, but please be sure that you have backups. + is online. You will need mdadm version 2.4.1 or later to use this feature safely. During the early stage of reshape there is @@ -162,6 +164,8 @@ config MD_RAID5_RESHAPE There should be enough spares already present to make the new array workable. + If unsure, say Y. + config MD_MULTIPATH tristate "Multipath I/O support" depends on BLK_DEV_MD @@ -199,6 +203,14 @@ config BLK_DEV_DM If unsure, say N. +config DM_DEBUG + boolean "Device mapper debugging support" + depends on BLK_DEV_DM && EXPERIMENTAL + ---help--- + Enable this for messages that may help debug device-mapper problems. + + If unsure, say N. + config DM_CRYPT tristate "Crypt target support" depends on BLK_DEV_DM && EXPERIMENTAL @@ -251,3 +263,4 @@ config DM_MULTIPATH_EMC endmenu +endif diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index ecc56765d94..8e67634e79a 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -613,6 +613,7 @@ static inline unsigned long file_page_offset(unsigned long chunk) static inline struct page *filemap_get_page(struct bitmap *bitmap, unsigned long chunk) { + if (file_page_index(chunk) >= bitmap->file_pages) return NULL; return bitmap->filemap[file_page_index(chunk) - file_page_index(0)]; } @@ -739,6 +740,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) } page = filemap_get_page(bitmap, chunk); + if (!page) return; bit = file_page_offset(chunk); /* set the bit */ @@ -1322,6 +1324,18 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n } +/* dirty the memory and file bits for bitmap chunks "s" to "e" */ +void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) +{ + unsigned long chunk; + + for (chunk = s; chunk <= e; chunk++) { + sector_t sec = chunk << CHUNK_BLOCK_SHIFT(bitmap); + bitmap_set_memory_bits(bitmap, sec, 1); + bitmap_file_set_bit(bitmap, sec); + } +} + /* * flush out any pending updates */ @@ -1430,8 +1444,7 @@ int bitmap_create(mddev_t *mddev) if (err) goto error; - bitmap->chunkshift = find_first_bit(&bitmap->chunksize, - sizeof(bitmap->chunksize)); + bitmap->chunkshift = ffz(~bitmap->chunksize); /* now that chunksize and chunkshift are set, we can use these macros */ chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) / diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index bdbd34993a8..655d816760e 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1,6 +1,7 @@ /* * Copyright (C) 2003 Christophe Saout <christophe@saout.de> * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> + * Copyright (C) 2006 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. 
*/ @@ -22,17 +23,19 @@ #include "dm.h" #define DM_MSG_PREFIX "crypt" +#define MESG_STR(x) x, sizeof(x) /* * per bio private data */ struct crypt_io { struct dm_target *target; - struct bio *bio; + struct bio *base_bio; struct bio *first_clone; struct work_struct work; atomic_t pending; int error; + int post_process; }; /* @@ -63,6 +66,7 @@ struct crypt_iv_operations { * Crypt: maps a linear range of a block device * and encrypts / decrypts at the same time. */ +enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; struct crypt_config { struct dm_dev *dev; sector_t start; @@ -73,6 +77,7 @@ struct crypt_config { */ mempool_t *io_pool; mempool_t *page_pool; + struct bio_set *bs; /* * crypto related data @@ -86,11 +91,12 @@ struct crypt_config { char cipher[CRYPTO_MAX_ALG_NAME]; char chainmode[CRYPTO_MAX_ALG_NAME]; struct crypto_blkcipher *tfm; + unsigned long flags; unsigned int key_size; u8 key[0]; }; -#define MIN_IOS 256 +#define MIN_IOS 16 #define MIN_POOL_PAGES 32 #define MIN_BIO_PAGES 8 @@ -306,6 +312,14 @@ static int crypt_convert(struct crypt_config *cc, return r; } + static void dm_crypt_bio_destructor(struct bio *bio) + { + struct crypt_io *io = bio->bi_private; + struct crypt_config *cc = io->target->private; + + bio_free(bio, cc->bs); + } + /* * Generate a new unfragmented bio with the given size * This should never violate the device limitations @@ -315,34 +329,33 @@ static struct bio * crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, struct bio *base_bio, unsigned int *bio_vec_idx) { - struct bio *bio; + struct bio *clone; unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM; unsigned int i; - /* - * Use __GFP_NOMEMALLOC to tell the VM to act less aggressively and - * to fail earlier. This is not necessary but increases throughput. - * FIXME: Is this really intelligent? 
- */ - if (base_bio) - bio = bio_clone(base_bio, GFP_NOIO|__GFP_NOMEMALLOC); - else - bio = bio_alloc(GFP_NOIO|__GFP_NOMEMALLOC, nr_iovecs); - if (!bio) + if (base_bio) { + clone = bio_alloc_bioset(GFP_NOIO, base_bio->bi_max_vecs, cc->bs); + __bio_clone(clone, base_bio); + } else + clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); + + if (!clone) return NULL; + clone->bi_destructor = dm_crypt_bio_destructor; + /* if the last bio was not complete, continue where that one ended */ - bio->bi_idx = *bio_vec_idx; - bio->bi_vcnt = *bio_vec_idx; - bio->bi_size = 0; - bio->bi_flags &= ~(1 << BIO_SEG_VALID); + clone->bi_idx = *bio_vec_idx; + clone->bi_vcnt = *bio_vec_idx; + clone->bi_size = 0; + clone->bi_flags &= ~(1 << BIO_SEG_VALID); - /* bio->bi_idx pages have already been allocated */ - size -= bio->bi_idx * PAGE_SIZE; + /* clone->bi_idx pages have already been allocated */ + size -= clone->bi_idx * PAGE_SIZE; - for(i = bio->bi_idx; i < nr_iovecs; i++) { - struct bio_vec *bv = bio_iovec_idx(bio, i); + for (i = clone->bi_idx; i < nr_iovecs; i++) { + struct bio_vec *bv = bio_iovec_idx(clone, i); bv->bv_page = mempool_alloc(cc->page_pool, gfp_mask); if (!bv->bv_page) @@ -353,7 +366,7 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, * return a partially allocated bio, the caller will then try * to allocate additional bios while submitting this partial bio */ - if ((i - bio->bi_idx) == (MIN_BIO_PAGES - 1)) + if ((i - clone->bi_idx) == (MIN_BIO_PAGES - 1)) gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT; bv->bv_offset = 0; @@ -362,13 +375,13 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, else bv->bv_len = size; - bio->bi_size += bv->bv_len; - bio->bi_vcnt++; + clone->bi_size += bv->bv_len; + clone->bi_vcnt++; size -= bv->bv_len; } - if (!bio->bi_size) { - bio_put(bio); + if (!clone->bi_size) { + bio_put(clone); return NULL; } @@ -376,13 +389,13 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, * Remember the last bio_vec allocated to be able * to correctly continue after the splitting. */ - *bio_vec_idx = bio->bi_vcnt; + *bio_vec_idx = clone->bi_vcnt; - return bio; + return clone; } static void crypt_free_buffer_pages(struct crypt_config *cc, - struct bio *bio, unsigned int bytes) + struct bio *clone, unsigned int bytes) { unsigned int i, start, end; struct bio_vec *bv; @@ -396,19 +409,19 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, * A fix to the bi_idx issue in the kernel is in the works, so * we will hopefully be able to revert to the cleaner solution soon. 
*/ - i = bio->bi_vcnt - 1; - bv = bio_iovec_idx(bio, i); - end = (i << PAGE_SHIFT) + (bv->bv_offset + bv->bv_len) - bio->bi_size; + i = clone->bi_vcnt - 1; + bv = bio_iovec_idx(clone, i); + end = (i << PAGE_SHIFT) + (bv->bv_offset + bv->bv_len) - clone->bi_size; start = end - bytes; start >>= PAGE_SHIFT; - if (!bio->bi_size) - end = bio->bi_vcnt; + if (!clone->bi_size) + end = clone->bi_vcnt; else end >>= PAGE_SHIFT; - for(i = start; i < end; i++) { - bv = bio_iovec_idx(bio, i); + for (i = start; i < end; i++) { + bv = bio_iovec_idx(clone, i); BUG_ON(!bv->bv_page); mempool_free(bv->bv_page, cc->page_pool); bv->bv_page = NULL; @@ -432,7 +445,7 @@ static void dec_pending(struct crypt_io *io, int error) if (io->first_clone) bio_put(io->first_clone); - bio_endio(io->bio, io->bio->bi_size, io->error); + bio_endio(io->base_bio, io->base_bio->bi_size, io->error); mempool_free(io, cc->io_pool); } @@ -441,29 +454,179 @@ static void dec_pending(struct crypt_io *io, int error) * kcryptd: * * Needed because it would be very unwise to do decryption in an - * interrupt context, so bios returning from read requests get - * queued here. + * interrupt context. */ static struct workqueue_struct *_kcryptd_workqueue; +static void kcryptd_do_work(void *data); -static void kcryptd_do_work(void *data) +static void kcryptd_queue_io(struct crypt_io *io) { - struct crypt_io *io = (struct crypt_io *) data; - struct crypt_config *cc = (struct crypt_config *) io->target->private; + INIT_WORK(&io->work, kcryptd_do_work, io); + queue_work(_kcryptd_workqueue, &io->work); +} + +static int crypt_endio(struct bio *clone, unsigned int done, int error) +{ + struct crypt_io *io = clone->bi_private; + struct crypt_config *cc = io->target->private; + unsigned read_io = bio_data_dir(clone) == READ; + + /* + * free the processed pages, even if + * it's only a partially completed write + */ + if (!read_io) + crypt_free_buffer_pages(cc, clone, done); + + /* keep going - not finished yet */ + if (unlikely(clone->bi_size)) + return 1; + + if (!read_io) + goto out; + + if (unlikely(!bio_flagged(clone, BIO_UPTODATE))) { + error = -EIO; + goto out; + } + + bio_put(clone); + io->post_process = 1; + kcryptd_queue_io(io); + return 0; + +out: + bio_put(clone); + dec_pending(io, error); + return error; +} + +static void clone_init(struct crypt_io *io, struct bio *clone) +{ + struct crypt_config *cc = io->target->private; + + clone->bi_private = io; + clone->bi_end_io = crypt_endio; + clone->bi_bdev = cc->dev->bdev; + clone->bi_rw = io->base_bio->bi_rw; +} + +static void process_read(struct crypt_io *io) +{ + struct crypt_config *cc = io->target->private; + struct bio *base_bio = io->base_bio; + struct bio *clone; + sector_t sector = base_bio->bi_sector - io->target->begin; + + atomic_inc(&io->pending); + + /* + * The block layer might modify the bvec array, so always + * copy the required bvecs because we need the original + * one in order to decrypt the whole bio data *afterwards*. 
+ */ + clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); + if (unlikely(!clone)) { + dec_pending(io, -ENOMEM); + return; + } + + clone_init(io, clone); + clone->bi_destructor = dm_crypt_bio_destructor; + clone->bi_idx = 0; + clone->bi_vcnt = bio_segments(base_bio); + clone->bi_size = base_bio->bi_size; + clone->bi_sector = cc->start + sector; + memcpy(clone->bi_io_vec, bio_iovec(base_bio), + sizeof(struct bio_vec) * clone->bi_vcnt); + + generic_make_request(clone); +} + +static void process_write(struct crypt_io *io) +{ + struct crypt_config *cc = io->target->private; + struct bio *base_bio = io->base_bio; + struct bio *clone; struct convert_context ctx; - int r; + unsigned remaining = base_bio->bi_size; + sector_t sector = base_bio->bi_sector - io->target->begin; + unsigned bvec_idx = 0; + + atomic_inc(&io->pending); + + crypt_convert_init(cc, &ctx, NULL, base_bio, sector, 1); + + /* + * The allocated buffers can be smaller than the whole bio, + * so repeat the whole process until all the data can be handled. + */ + while (remaining) { + clone = crypt_alloc_buffer(cc, base_bio->bi_size, + io->first_clone, &bvec_idx); + if (unlikely(!clone)) { + dec_pending(io, -ENOMEM); + return; + } + + ctx.bio_out = clone; + + if (unlikely(crypt_convert(cc, &ctx) < 0)) { + crypt_free_buffer_pages(cc, clone, clone->bi_size); + bio_put(clone); + dec_pending(io, -EIO); + return; + } + + clone_init(io, clone); + clone->bi_sector = cc->start + sector; + + if (!io->first_clone) { + /* + * hold a reference to the first clone, because it + * holds the bio_vec array and that can't be freed + * before all other clones are released + */ + bio_get(clone); + io->first_clone = clone; + } + + remaining -= clone->bi_size; + sector += bio_sectors(clone); + + /* prevent bio_put of first_clone */ + if (remaining) + atomic_inc(&io->pending); - crypt_convert_init(cc, &ctx, io->bio, io->bio, - io->bio->bi_sector - io->target->begin, 0); - r = crypt_convert(cc, &ctx); + generic_make_request(clone); - dec_pending(io, r); + /* out of memory -> run queues */ + if (remaining) + blk_congestion_wait(bio_data_dir(clone), HZ/100); + } } -static void kcryptd_queue_io(struct crypt_io *io) +static void process_read_endio(struct crypt_io *io) { - INIT_WORK(&io->work, kcryptd_do_work, io); - queue_work(_kcryptd_workqueue, &io->work); + struct crypt_config *cc = io->target->private; + struct convert_context ctx; + + crypt_convert_init(cc, &ctx, io->base_bio, io->base_bio, + io->base_bio->bi_sector - io->target->begin, 0); + + dec_pending(io, crypt_convert(cc, &ctx)); +} + +static void kcryptd_do_work(void *data) +{ + struct crypt_io *io = data; + + if (io->post_process) + process_read_endio(io); + else if (bio_data_dir(io->base_bio) == READ) + process_read(io); + else + process_write(io); } /* @@ -477,7 +640,7 @@ static int crypt_decode_key(u8 *key, char *hex, unsigned int size) buffer[2] = '\0'; - for(i = 0; i < size; i++) { + for (i = 0; i < size; i++) { buffer[0] = *hex++; buffer[1] = *hex++; @@ -500,13 +663,38 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size) { unsigned int i; - for(i = 0; i < size; i++) { + for (i = 0; i < size; i++) { sprintf(hex, "%02x", *key); hex += 2; key++; } } +static int crypt_set_key(struct crypt_config *cc, char *key) +{ + unsigned key_size = strlen(key) >> 1; + + if (cc->key_size && cc->key_size != key_size) + return -EINVAL; + + cc->key_size = key_size; /* initial settings */ + + if ((!key_size && strcmp(key, "-")) || + (key_size && crypt_decode_key(cc->key, key, 
key_size) < 0)) + return -EINVAL; + + set_bit(DM_CRYPT_KEY_VALID, &cc->flags); + + return 0; +} + +static int crypt_wipe_key(struct crypt_config *cc) +{ + clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); + memset(&cc->key, 0, cc->key_size * sizeof(u8)); + return 0; +} + /* * Construct an encryption mapping: * <cipher> <key> <iv_offset> <dev_path> <start> @@ -539,16 +727,14 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) key_size = strlen(argv[1]) >> 1; - cc = kmalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); + cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); if (cc == NULL) { ti->error = "Cannot allocate transparent encryption context"; return -ENOMEM; } - cc->key_size = key_size; - if ((!key_size && strcmp(argv[1], "-") != 0) || - (key_size && crypt_decode_key(cc->key, argv[1], key_size) < 0)) { + if (crypt_set_key(cc, argv[1])) { ti->error = "Error decoding key"; goto bad1; } @@ -626,6 +812,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad4; } + cc->bs = bioset_create(MIN_IOS, MIN_IOS, 4); + if (!cc->bs) { + ti->error = "Cannot allocate crypt bioset"; + goto bad_bs; + } + if (crypto_blkcipher_setkey(tfm, cc->key, key_size) < 0) { ti->error = "Error setting key"; goto bad5; @@ -665,6 +857,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) return 0; bad5: + bioset_free(cc->bs); +bad_bs: mempool_destroy(cc->page_pool); bad4: mempool_destroy(cc->io_pool); @@ -684,6 +878,7 @@ static void crypt_dtr(struct dm_target *ti) { struct crypt_config *cc = (struct crypt_config *) ti->private; + bioset_free(cc->bs); mempool_destroy(cc->page_pool); mempool_destroy(cc->io_pool); @@ -698,147 +893,21 @@ static void crypt_dtr(struct dm_target *ti) kfree(cc); } -static int crypt_endio(struct bio *bio, unsigned int done, int error) -{ - struct crypt_io *io = (struct crypt_io *) bio->bi_private; - struct crypt_config *cc = (struct crypt_config *) io->target->private; - - if (bio_data_dir(bio) == WRITE) { - /* - * free the processed pages, even if - * it's only a partially completed write - */ - crypt_free_buffer_pages(cc, bio, done); - } - - if (bio->bi_size) - return 1; - - bio_put(bio); - - /* - * successful reads are decrypted by the worker thread - */ - if ((bio_data_dir(bio) == READ) - && bio_flagged(bio, BIO_UPTODATE)) { - kcryptd_queue_io(io); - return 0; - } - - dec_pending(io, error); - return error; -} - -static inline struct bio * -crypt_clone(struct crypt_config *cc, struct crypt_io *io, struct bio *bio, - sector_t sector, unsigned int *bvec_idx, - struct convert_context *ctx) -{ - struct bio *clone; - - if (bio_data_dir(bio) == WRITE) { - clone = crypt_alloc_buffer(cc, bio->bi_size, - io->first_clone, bvec_idx); - if (clone) { - ctx->bio_out = clone; - if (crypt_convert(cc, ctx) < 0) { - crypt_free_buffer_pages(cc, clone, - clone->bi_size); - bio_put(clone); - return NULL; - } - } - } else { - /* - * The block layer might modify the bvec array, so always - * copy the required bvecs because we need the original - * one in order to decrypt the whole bio data *afterwards*. 
- */ - clone = bio_alloc(GFP_NOIO, bio_segments(bio)); - if (clone) { - clone->bi_idx = 0; - clone->bi_vcnt = bio_segments(bio); - clone->bi_size = bio->bi_size; - memcpy(clone->bi_io_vec, bio_iovec(bio), - sizeof(struct bio_vec) * clone->bi_vcnt); - } - } - - if (!clone) - return NULL; - - clone->bi_private = io; - clone->bi_end_io = crypt_endio; - clone->bi_bdev = cc->dev->bdev; - clone->bi_sector = cc->start + sector; - clone->bi_rw = bio->bi_rw; - - return clone; -} - static int crypt_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) { - struct crypt_config *cc = (struct crypt_config *) ti->private; - struct crypt_io *io = mempool_alloc(cc->io_pool, GFP_NOIO); - struct convert_context ctx; - struct bio *clone; - unsigned int remaining = bio->bi_size; - sector_t sector = bio->bi_sector - ti->begin; - unsigned int bvec_idx = 0; + struct crypt_config *cc = ti->private; + struct crypt_io *io; + io = mempool_alloc(cc->io_pool, GFP_NOIO); io->target = ti; - io->bio = bio; + io->base_bio = bio; io->first_clone = NULL; - io->error = 0; - atomic_set(&io->pending, 1); /* hold a reference */ - - if (bio_data_dir(bio) == WRITE) - crypt_convert_init(cc, &ctx, NULL, bio, sector, 1); - - /* - * The allocated buffers can be smaller than the whole bio, - * so repeat the whole process until all the data can be handled. - */ - while (remaining) { - clone = crypt_clone(cc, io, bio, sector, &bvec_idx, &ctx); - if (!clone) - goto cleanup; - - if (!io->first_clone) { - /* - * hold a reference to the first clone, because it - * holds the bio_vec array and that can't be freed - * before all other clones are released - */ - bio_get(clone); - io->first_clone = clone; - } - atomic_inc(&io->pending); + io->error = io->post_process = 0; + atomic_set(&io->pending, 0); + kcryptd_queue_io(io); - remaining -= clone->bi_size; - sector += bio_sectors(clone); - - generic_make_request(clone); - - /* out of memory -> run queues */ - if (remaining) - blk_congestion_wait(bio_data_dir(clone), HZ/100); - } - - /* drop reference, clones could have returned before we reach this */ - dec_pending(io, 0); return 0; - -cleanup: - if (io->first_clone) { - dec_pending(io, -ENOMEM); - return 0; - } - - /* if no bio has been dispatched yet, we can directly return the error */ - mempool_free(io, cc->io_pool); - return -ENOMEM; } static int crypt_status(struct dm_target *ti, status_type_t type, @@ -883,14 +952,71 @@ static int crypt_status(struct dm_target *ti, status_type_t type, return 0; } +static void crypt_postsuspend(struct dm_target *ti) +{ + struct crypt_config *cc = ti->private; + + set_bit(DM_CRYPT_SUSPENDED, &cc->flags); +} + +static int crypt_preresume(struct dm_target *ti) +{ + struct crypt_config *cc = ti->private; + + if (!test_bit(DM_CRYPT_KEY_VALID, &cc->flags)) { + DMERR("aborting resume - crypt key is not set."); + return -EAGAIN; + } + + return 0; +} + +static void crypt_resume(struct dm_target *ti) +{ + struct crypt_config *cc = ti->private; + + clear_bit(DM_CRYPT_SUSPENDED, &cc->flags); +} + +/* Message interface + * key set <key> + * key wipe + */ +static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) +{ + struct crypt_config *cc = ti->private; + + if (argc < 2) + goto error; + + if (!strnicmp(argv[0], MESG_STR("key"))) { + if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) { + DMWARN("not suspended during key manipulation."); + return -EINVAL; + } + if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) + return crypt_set_key(cc, argv[2]); + if (argc == 2 && !strnicmp(argv[1], 
MESG_STR("wipe"))) + return crypt_wipe_key(cc); + } + +error: + DMWARN("unrecognised message received."); + return -EINVAL; +} + static struct target_type crypt_target = { .name = "crypt", - .version= {1, 1, 0}, + .version= {1, 3, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, .map = crypt_map, .status = crypt_status, + .postsuspend = crypt_postsuspend, + .preresume = crypt_preresume, + .resume = crypt_resume, + .message = crypt_message, }; static int __init dm_crypt_init(void) diff --git a/drivers/md/dm-emc.c b/drivers/md/dm-emc.c index 2a374ccb30d..2b2d45d7baa 100644 --- a/drivers/md/dm-emc.c +++ b/drivers/md/dm-emc.c @@ -126,7 +126,8 @@ static struct request *get_failover_req(struct emc_handler *h, memset(&rq->cmd, 0, BLK_MAX_CDB); rq->timeout = EMC_FAILOVER_TIMEOUT; - rq->flags |= (REQ_BLOCK_PC | REQ_FAILFAST | REQ_NOMERGE); + rq->cmd_type = REQ_TYPE_BLOCK_PC; + rq->cmd_flags |= REQ_FAILFAST | REQ_NOMERGE; return rq; } diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index d12379b5cdb..99cdffa7fbf 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #define DM_MSG_PREFIX "snapshots" +#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */ /*----------------------------------------------------------------- * Persistent snapshots, by persistent we mean that the snapshot @@ -150,6 +151,7 @@ static int alloc_area(struct pstore *ps) static void free_area(struct pstore *ps) { vfree(ps->area); + ps->area = NULL; } /* @@ -198,48 +200,79 @@ static int read_header(struct pstore *ps, int *new_snapshot) int r; struct disk_header *dh; chunk_t chunk_size; + int chunk_size_supplied = 1; - r = chunk_io(ps, 0, READ); + /* + * Use default chunk size (or hardsect_size, if larger) if none supplied + */ + if (!ps->snap->chunk_size) { + ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, + bdev_hardsect_size(ps->snap->cow->bdev) >> 9); + ps->snap->chunk_mask = ps->snap->chunk_size - 1; + ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1; + chunk_size_supplied = 0; + } + + r = dm_io_get(sectors_to_pages(ps->snap->chunk_size)); if (r) return r; + r = alloc_area(ps); + if (r) + goto bad1; + + r = chunk_io(ps, 0, READ); + if (r) + goto bad2; + dh = (struct disk_header *) ps->area; if (le32_to_cpu(dh->magic) == 0) { *new_snapshot = 1; + return 0; + } - } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) { - *new_snapshot = 0; - ps->valid = le32_to_cpu(dh->valid); - ps->version = le32_to_cpu(dh->version); - chunk_size = le32_to_cpu(dh->chunk_size); - if (ps->snap->chunk_size != chunk_size) { - DMWARN("chunk size %llu in device metadata overrides " - "table chunk size of %llu.", - (unsigned long long)chunk_size, - (unsigned long long)ps->snap->chunk_size); - - /* We had a bogus chunk_size. Fix stuff up. 
*/ - dm_io_put(sectors_to_pages(ps->snap->chunk_size)); - free_area(ps); - - ps->snap->chunk_size = chunk_size; - ps->snap->chunk_mask = chunk_size - 1; - ps->snap->chunk_shift = ffs(chunk_size) - 1; - - r = alloc_area(ps); - if (r) - return r; - - r = dm_io_get(sectors_to_pages(chunk_size)); - if (r) - return r; - } - } else { - DMWARN("Invalid/corrupt snapshot"); + if (le32_to_cpu(dh->magic) != SNAP_MAGIC) { + DMWARN("Invalid or corrupt snapshot"); r = -ENXIO; + goto bad2; } + *new_snapshot = 0; + ps->valid = le32_to_cpu(dh->valid); + ps->version = le32_to_cpu(dh->version); + chunk_size = le32_to_cpu(dh->chunk_size); + + if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size) + return 0; + + DMWARN("chunk size %llu in device metadata overrides " + "table chunk size of %llu.", + (unsigned long long)chunk_size, + (unsigned long long)ps->snap->chunk_size); + + /* We had a bogus chunk_size. Fix stuff up. */ + dm_io_put(sectors_to_pages(ps->snap->chunk_size)); + free_area(ps); + + ps->snap->chunk_size = chunk_size; + ps->snap->chunk_mask = chunk_size - 1; + ps->snap->chunk_shift = ffs(chunk_size) - 1; + + r = dm_io_get(sectors_to_pages(chunk_size)); + if (r) + return r; + + r = alloc_area(ps); + if (r) + goto bad1; + + return 0; + +bad2: + free_area(ps); +bad1: + dm_io_put(sectors_to_pages(ps->snap->chunk_size)); return r; } @@ -263,42 +296,29 @@ static int write_header(struct pstore *ps) */ static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) { - if (index >= ps->exceptions_per_area) - return NULL; + BUG_ON(index >= ps->exceptions_per_area); return ((struct disk_exception *) ps->area) + index; } -static int read_exception(struct pstore *ps, - uint32_t index, struct disk_exception *result) +static void read_exception(struct pstore *ps, + uint32_t index, struct disk_exception *result) { - struct disk_exception *e; - - e = get_exception(ps, index); - if (!e) - return -EINVAL; + struct disk_exception *e = get_exception(ps, index); /* copy it */ result->old_chunk = le64_to_cpu(e->old_chunk); result->new_chunk = le64_to_cpu(e->new_chunk); - - return 0; } -static int write_exception(struct pstore *ps, - uint32_t index, struct disk_exception *de) +static void write_exception(struct pstore *ps, + uint32_t index, struct disk_exception *de) { - struct disk_exception *e; - - e = get_exception(ps, index); - if (!e) - return -EINVAL; + struct disk_exception *e = get_exception(ps, index); /* copy it */ e->old_chunk = cpu_to_le64(de->old_chunk); e->new_chunk = cpu_to_le64(de->new_chunk); - - return 0; } /* @@ -316,10 +336,7 @@ static int insert_exceptions(struct pstore *ps, int *full) *full = 1; for (i = 0; i < ps->exceptions_per_area; i++) { - r = read_exception(ps, i, &de); - - if (r) - return r; + read_exception(ps, i, &de); /* * If the new_chunk is pointing at the start of @@ -519,6 +536,16 @@ static void persistent_commit(struct exception_store *store, if (r) ps->valid = 0; + /* + * Have we completely filled the current area ? + */ + if (ps->current_committed == ps->exceptions_per_area) { + ps->current_committed = 0; + r = zero_area(ps, ps->current_area + 1); + if (r) + ps->valid = 0; + } + for (i = 0; i < ps->callback_count; i++) { cb = ps->callbacks + i; cb->callback(cb->context, r == 0 ? 1 : 0); @@ -526,16 +553,6 @@ static void persistent_commit(struct exception_store *store, ps->callback_count = 0; } - - /* - * Have we completely filled the current area ? 
- */ - if (ps->current_committed == ps->exceptions_per_area) { - ps->current_committed = 0; - r = zero_area(ps, ps->current_area + 1); - if (r) - ps->valid = 0; - } } static void persistent_drop(struct exception_store *store) @@ -547,32 +564,22 @@ static void persistent_drop(struct exception_store *store) DMWARN("write header failed"); } -int dm_create_persistent(struct exception_store *store, uint32_t chunk_size) +int dm_create_persistent(struct exception_store *store) { - int r; struct pstore *ps; - r = dm_io_get(sectors_to_pages(chunk_size)); - if (r) - return r; - /* allocate the pstore */ ps = kmalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) { - r = -ENOMEM; - goto bad; - } + if (!ps) + return -ENOMEM; ps->snap = store->snap; ps->valid = 1; ps->version = SNAPSHOT_DISK_VERSION; + ps->area = NULL; ps->next_free = 2; /* skipping the header and first area */ ps->current_committed = 0; - r = alloc_area(ps); - if (r) - goto bad; - ps->callback_count = 0; atomic_set(&ps->pending_count, 0); ps->callbacks = NULL; @@ -586,13 +593,6 @@ int dm_create_persistent(struct exception_store *store, uint32_t chunk_size) store->context = ps; return 0; - - bad: - dm_io_put(sectors_to_pages(chunk_size)); - if (ps && ps->area) - free_area(ps); - kfree(ps); - return r; } /*----------------------------------------------------------------- @@ -642,18 +642,16 @@ static void transient_fraction_full(struct exception_store *store, *denominator = get_dev_size(store->snap->cow->bdev); } -int dm_create_transient(struct exception_store *store, - struct dm_snapshot *s, int blocksize) +int dm_create_transient(struct exception_store *store) { struct transient_c *tc; - memset(store, 0, sizeof(*store)); store->destroy = transient_destroy; store->read_metadata = transient_read_metadata; store->prepare_exception = transient_prepare; store->commit_exception = transient_commit; + store->drop_snapshot = NULL; store->fraction_full = transient_fraction_full; - store->snap = s; tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); if (!tc) diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 47b3c62bbdb..00234909b3d 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -98,14 +98,31 @@ static int linear_status(struct dm_target *ti, status_type_t type, return 0; } +static int linear_ioctl(struct dm_target *ti, struct inode *inode, + struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct linear_c *lc = (struct linear_c *) ti->private; + struct block_device *bdev = lc->dev->bdev; + struct file fake_file = {}; + struct dentry fake_dentry = {}; + + fake_file.f_mode = lc->dev->mode; + fake_file.f_dentry = &fake_dentry; + fake_dentry.d_inode = bdev->bd_inode; + + return blkdev_driver_ioctl(bdev->bd_inode, &fake_file, bdev->bd_disk, cmd, arg); +} + static struct target_type linear_target = { .name = "linear", - .version= {1, 0, 1}, + .version= {1, 0, 2}, .module = THIS_MODULE, .ctr = linear_ctr, .dtr = linear_dtr, .map = linear_map, .status = linear_status, + .ioctl = linear_ioctl, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 93f701ea87b..d754e0bc6e9 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -114,12 +114,10 @@ static void trigger_event(void *data); static struct pgpath *alloc_pgpath(void) { - struct pgpath *pgpath = kmalloc(sizeof(*pgpath), GFP_KERNEL); + struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL); - if (pgpath) { - memset(pgpath, 0, sizeof(*pgpath)); + if (pgpath) pgpath->path.is_active = 1; - } return 
pgpath; } @@ -133,12 +131,10 @@ static struct priority_group *alloc_priority_group(void) { struct priority_group *pg; - pg = kmalloc(sizeof(*pg), GFP_KERNEL); - if (!pg) - return NULL; + pg = kzalloc(sizeof(*pg), GFP_KERNEL); - memset(pg, 0, sizeof(*pg)); - INIT_LIST_HEAD(&pg->pgpaths); + if (pg) + INIT_LIST_HEAD(&pg->pgpaths); return pg; } @@ -168,13 +164,12 @@ static void free_priority_group(struct priority_group *pg, kfree(pg); } -static struct multipath *alloc_multipath(void) +static struct multipath *alloc_multipath(struct dm_target *ti) { struct multipath *m; - m = kmalloc(sizeof(*m), GFP_KERNEL); + m = kzalloc(sizeof(*m), GFP_KERNEL); if (m) { - memset(m, 0, sizeof(*m)); INIT_LIST_HEAD(&m->priority_groups); spin_lock_init(&m->lock); m->queue_io = 1; @@ -185,6 +180,8 @@ static struct multipath *alloc_multipath(void) kfree(m); return NULL; } + m->ti = ti; + ti->private = m; } return m; @@ -557,8 +554,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, } static struct priority_group *parse_priority_group(struct arg_set *as, - struct multipath *m, - struct dm_target *ti) + struct multipath *m) { static struct param _params[] = { {1, 1024, "invalid number of paths"}, @@ -568,6 +564,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as, int r; unsigned i, nr_selector_args, nr_params; struct priority_group *pg; + struct dm_target *ti = m->ti; if (as->argc < 2) { as->argc = 0; @@ -624,12 +621,12 @@ static struct priority_group *parse_priority_group(struct arg_set *as, return NULL; } -static int parse_hw_handler(struct arg_set *as, struct multipath *m, - struct dm_target *ti) +static int parse_hw_handler(struct arg_set *as, struct multipath *m) { int r; struct hw_handler_type *hwht; unsigned hw_argc; + struct dm_target *ti = m->ti; static struct param _params[] = { {0, 1024, "invalid number of hardware handler args"}, @@ -661,11 +658,11 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m, return 0; } -static int parse_features(struct arg_set *as, struct multipath *m, - struct dm_target *ti) +static int parse_features(struct arg_set *as, struct multipath *m) { int r; unsigned argc; + struct dm_target *ti = m->ti; static struct param _params[] = { {0, 1, "invalid number of feature args"}, @@ -704,19 +701,17 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, as.argc = argc; as.argv = argv; - m = alloc_multipath(); + m = alloc_multipath(ti); if (!m) { ti->error = "can't allocate multipath"; return -EINVAL; } - m->ti = ti; - - r = parse_features(&as, m, ti); + r = parse_features(&as, m); if (r) goto bad; - r = parse_hw_handler(&as, m, ti); + r = parse_hw_handler(&as, m); if (r) goto bad; @@ -732,7 +727,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, while (as.argc) { struct priority_group *pg; - pg = parse_priority_group(&as, m, ti); + pg = parse_priority_group(&as, m); if (!pg) { r = -EINVAL; goto bad; @@ -752,8 +747,6 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, goto bad; } - ti->private = m; - return 0; bad: @@ -1266,12 +1259,47 @@ error: return -EINVAL; } +static int multipath_ioctl(struct dm_target *ti, struct inode *inode, + struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct multipath *m = (struct multipath *) ti->private; + struct block_device *bdev = NULL; + unsigned long flags; + struct file fake_file = {}; + struct dentry fake_dentry = {}; + int r = 0; + + fake_file.f_dentry = &fake_dentry; + + spin_lock_irqsave(&m->lock, flags); + + 
if (!m->current_pgpath) + __choose_pgpath(m); + + if (m->current_pgpath) { + bdev = m->current_pgpath->path.dev->bdev; + fake_dentry.d_inode = bdev->bd_inode; + fake_file.f_mode = m->current_pgpath->path.dev->mode; + } + + if (m->queue_io) + r = -EAGAIN; + else if (!bdev) + r = -EIO; + + spin_unlock_irqrestore(&m->lock, flags); + + return r ? : blkdev_driver_ioctl(bdev->bd_inode, &fake_file, + bdev->bd_disk, cmd, arg); +} + /*----------------------------------------------------------------- * Module setup *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 0, 4}, + .version = {1, 0, 5}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, @@ -1281,6 +1309,7 @@ static struct target_type multipath_target = { .resume = multipath_resume, .status = multipath_status, .message = multipath_message, + .ioctl = multipath_ioctl, }; static int __init dm_multipath_init(void) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index c54de989eb0..659224cb7c5 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1213,9 +1213,9 @@ static int mirror_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: - DMEMIT("%d ", ms->nr_mirrors); + DMEMIT("%d", ms->nr_mirrors); for (m = 0; m < ms->nr_mirrors; m++) - DMEMIT("%s %llu ", ms->mirror[m].dev->name, + DMEMIT(" %s %llu", ms->mirror[m].dev->name, (unsigned long long)ms->mirror[m].offset); } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 1d0fafda0f7..5281e009407 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -39,6 +39,9 @@ */ #define SNAPSHOT_PAGES 256 +struct workqueue_struct *ksnapd; +static void flush_queued_bios(void *data); + struct pending_exception { struct exception e; @@ -56,7 +59,7 @@ struct pending_exception { /* * The primary pending_exception is the one that holds - * the sibling_count and the list of origin_bios for a + * the ref_count and the list of origin_bios for a * group of pending_exceptions. It is always last to get freed. * These fields get set up when writing to the origin. */ @@ -69,7 +72,7 @@ struct pending_exception { * the sibling concerned and not pe->primary_pe->snap->lock unless * they are the same. */ - atomic_t sibling_count; + atomic_t ref_count; /* Pointer back to snapshot context */ struct dm_snapshot *snap; @@ -387,15 +390,46 @@ static inline ulong round_up(ulong n, ulong size) return (n + size) & ~size; } -static void read_snapshot_metadata(struct dm_snapshot *s) +static int set_chunk_size(struct dm_snapshot *s, const char *chunk_size_arg, + char **error) { - if (s->store.read_metadata(&s->store)) { - down_write(&s->lock); - s->valid = 0; - up_write(&s->lock); + unsigned long chunk_size; + char *value; + + chunk_size = simple_strtoul(chunk_size_arg, &value, 10); + if (*chunk_size_arg == '\0' || *value != '\0') { + *error = "Invalid chunk size"; + return -EINVAL; + } + + if (!chunk_size) { + s->chunk_size = s->chunk_mask = s->chunk_shift = 0; + return 0; + } + + /* + * Chunk size must be multiple of page size. Silently + * round up if it's not. 
+ */ + chunk_size = round_up(chunk_size, PAGE_SIZE >> 9); + + /* Check chunk_size is a power of 2 */ + if (chunk_size & (chunk_size - 1)) { + *error = "Chunk size is not a power of 2"; + return -EINVAL; + } - dm_table_event(s->table); + /* Validate the chunk size against the device block size */ + if (chunk_size % (bdev_hardsect_size(s->cow->bdev) >> 9)) { + *error = "Chunk size is not a multiple of device blocksize"; + return -EINVAL; } + + s->chunk_size = chunk_size; + s->chunk_mask = chunk_size - 1; + s->chunk_shift = ffs(chunk_size) - 1; + + return 0; } /* @@ -404,15 +438,12 @@ static void read_snapshot_metadata(struct dm_snapshot *s) static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct dm_snapshot *s; - unsigned long chunk_size; int r = -EINVAL; char persistent; char *origin_path; char *cow_path; - char *value; - int blocksize; - if (argc < 4) { + if (argc != 4) { ti->error = "requires exactly 4 arguments"; r = -EINVAL; goto bad1; @@ -428,13 +459,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad1; } - chunk_size = simple_strtoul(argv[3], &value, 10); - if (chunk_size == 0 || value == NULL) { - ti->error = "Invalid chunk size"; - r = -EINVAL; - goto bad1; - } - s = kmalloc(sizeof(*s), GFP_KERNEL); if (s == NULL) { ti->error = "Cannot allocate snapshot context private " @@ -457,36 +481,17 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad2; } - /* - * Chunk size must be multiple of page size. Silently - * round up if it's not. - */ - chunk_size = round_up(chunk_size, PAGE_SIZE >> 9); - - /* Validate the chunk size against the device block size */ - blocksize = s->cow->bdev->bd_disk->queue->hardsect_size; - if (chunk_size % (blocksize >> 9)) { - ti->error = "Chunk size is not a multiple of device blocksize"; - r = -EINVAL; - goto bad3; - } - - /* Check chunk_size is a power of 2 */ - if (chunk_size & (chunk_size - 1)) { - ti->error = "Chunk size is not a power of 2"; - r = -EINVAL; + r = set_chunk_size(s, argv[3], &ti->error); + if (r) goto bad3; - } - s->chunk_size = chunk_size; - s->chunk_mask = chunk_size - 1; s->type = persistent; - s->chunk_shift = ffs(chunk_size) - 1; s->valid = 1; s->active = 0; s->last_percent = 0; init_rwsem(&s->lock); + spin_lock_init(&s->pe_lock); s->table = ti->table; /* Allocate hash table for COW data */ @@ -496,16 +501,12 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad3; } - /* - * Check the persistent flag - done here because we need the iobuf - * to check the LV header - */ s->store.snap = s; if (persistent == 'P') - r = dm_create_persistent(&s->store, chunk_size); + r = dm_create_persistent(&s->store); else - r = dm_create_transient(&s->store, s, blocksize); + r = dm_create_transient(&s->store); if (r) { ti->error = "Couldn't create exception store"; @@ -520,7 +521,14 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) } /* Metadata must only be loaded into one table at once */ - read_snapshot_metadata(s); + r = s->store.read_metadata(&s->store); + if (r) { + ti->error = "Failed to read snapshot metadata"; + goto bad6; + } + + bio_list_init(&s->queued_bios); + INIT_WORK(&s->queued_bios_work, flush_queued_bios, s); /* Add snapshot to the list of snapshots for this origin */ /* Exceptions aren't triggered till snapshot_resume() is called */ @@ -560,6 +568,8 @@ static void snapshot_dtr(struct dm_target *ti) { struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + 
flush_workqueue(ksnapd); + /* Prevent further origin writes from using this snapshot. */ /* After this returns there can be no new kcopyd jobs. */ unregister_snapshot(s); @@ -593,6 +603,19 @@ static void flush_bios(struct bio *bio) } } +static void flush_queued_bios(void *data) +{ + struct dm_snapshot *s = (struct dm_snapshot *) data; + struct bio *queued_bios; + unsigned long flags; + + spin_lock_irqsave(&s->pe_lock, flags); + queued_bios = bio_list_get(&s->queued_bios); + spin_unlock_irqrestore(&s->pe_lock, flags); + + flush_bios(queued_bios); +} + /* * Error a list of buffers. */ @@ -608,28 +631,7 @@ static void error_bios(struct bio *bio) } } -static inline void error_snapshot_bios(struct pending_exception *pe) -{ - error_bios(bio_list_get(&pe->snapshot_bios)); -} - -static struct bio *__flush_bios(struct pending_exception *pe) -{ - /* - * If this pe is involved in a write to the origin and - * it is the last sibling to complete then release - * the bios for the original write to the origin. - */ - - if (pe->primary_pe && - atomic_dec_and_test(&pe->primary_pe->sibling_count)) - return bio_list_get(&pe->primary_pe->origin_bios); - - return NULL; -} - -static void __invalidate_snapshot(struct dm_snapshot *s, - struct pending_exception *pe, int err) +static void __invalidate_snapshot(struct dm_snapshot *s, int err) { if (!s->valid) return; @@ -639,9 +641,6 @@ static void __invalidate_snapshot(struct dm_snapshot *s, else if (err == -ENOMEM) DMERR("Invalidating snapshot: Unable to allocate exception."); - if (pe) - remove_exception(&pe->e); - if (s->store.drop_snapshot) s->store.drop_snapshot(&s->store); @@ -650,78 +649,95 @@ static void __invalidate_snapshot(struct dm_snapshot *s, dm_table_event(s->table); } +static void get_pending_exception(struct pending_exception *pe) +{ + atomic_inc(&pe->ref_count); +} + +static struct bio *put_pending_exception(struct pending_exception *pe) +{ + struct pending_exception *primary_pe; + struct bio *origin_bios = NULL; + + primary_pe = pe->primary_pe; + + /* + * If this pe is involved in a write to the origin and + * it is the last sibling to complete then release + * the bios for the original write to the origin. + */ + if (primary_pe && + atomic_dec_and_test(&primary_pe->ref_count)) + origin_bios = bio_list_get(&primary_pe->origin_bios); + + /* + * Free the pe if it's not linked to an origin write or if + * it's not itself a primary pe. + */ + if (!primary_pe || primary_pe != pe) + free_pending_exception(pe); + + /* + * Free the primary pe if nothing references it. 
+ */ + if (primary_pe && !atomic_read(&primary_pe->ref_count)) + free_pending_exception(primary_pe); + + return origin_bios; +} + static void pending_complete(struct pending_exception *pe, int success) { struct exception *e; - struct pending_exception *primary_pe; struct dm_snapshot *s = pe->snap; - struct bio *flush = NULL; + struct bio *origin_bios = NULL; + struct bio *snapshot_bios = NULL; + int error = 0; if (!success) { /* Read/write error - snapshot is unusable */ down_write(&s->lock); - __invalidate_snapshot(s, pe, -EIO); - flush = __flush_bios(pe); - up_write(&s->lock); - - error_snapshot_bios(pe); + __invalidate_snapshot(s, -EIO); + error = 1; goto out; } e = alloc_exception(); if (!e) { down_write(&s->lock); - __invalidate_snapshot(s, pe, -ENOMEM); - flush = __flush_bios(pe); - up_write(&s->lock); - - error_snapshot_bios(pe); + __invalidate_snapshot(s, -ENOMEM); + error = 1; goto out; } *e = pe->e; - /* - * Add a proper exception, and remove the - * in-flight exception from the list. - */ down_write(&s->lock); if (!s->valid) { - flush = __flush_bios(pe); - up_write(&s->lock); - free_exception(e); - - error_snapshot_bios(pe); + error = 1; goto out; } + /* + * Add a proper exception, and remove the + * in-flight exception from the list. + */ insert_exception(&s->complete, e); + + out: remove_exception(&pe->e); - flush = __flush_bios(pe); + snapshot_bios = bio_list_get(&pe->snapshot_bios); + origin_bios = put_pending_exception(pe); up_write(&s->lock); /* Submit any pending write bios */ - flush_bios(bio_list_get(&pe->snapshot_bios)); - - out: - primary_pe = pe->primary_pe; - - /* - * Free the pe if it's not linked to an origin write or if - * it's not itself a primary pe. - */ - if (!primary_pe || primary_pe != pe) - free_pending_exception(pe); - - /* - * Free the primary pe if nothing references it. - */ - if (primary_pe && !atomic_read(&primary_pe->sibling_count)) - free_pending_exception(primary_pe); + if (error) + error_bios(snapshot_bios); + else + flush_bios(snapshot_bios); - if (flush) - flush_bios(flush); + flush_bios(origin_bios); } static void commit_callback(void *context, int success) @@ -822,7 +838,7 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio) bio_list_init(&pe->origin_bios); bio_list_init(&pe->snapshot_bios); pe->primary_pe = NULL; - atomic_set(&pe->sibling_count, 1); + atomic_set(&pe->ref_count, 0); pe->snap = s; pe->started = 0; @@ -831,6 +847,7 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio) return NULL; } + get_pending_exception(pe); insert_exception(&s->pending, &pe->e); out: @@ -850,7 +867,6 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, { struct exception *e; struct dm_snapshot *s = (struct dm_snapshot *) ti->private; - int copy_needed = 0; int r = 1; chunk_t chunk; struct pending_exception *pe = NULL; @@ -865,32 +881,31 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, if (unlikely(bio_barrier(bio))) return -EOPNOTSUPP; + /* FIXME: should only take write lock if we need + * to copy an exception */ + down_write(&s->lock); + + if (!s->valid) { + r = -EIO; + goto out_unlock; + } + + /* If the block is already remapped - use that, else remap it */ + e = lookup_exception(&s->complete, chunk); + if (e) { + remap_exception(s, e, bio); + goto out_unlock; + } + /* * Write to snapshot - higher level takes care of RW/RO * flags so we should only get this if we are * writeable. 
*/ if (bio_rw(bio) == WRITE) { - - /* FIXME: should only take write lock if we need - * to copy an exception */ - down_write(&s->lock); - - if (!s->valid) { - r = -EIO; - goto out_unlock; - } - - /* If the block is already remapped - use that, else remap it */ - e = lookup_exception(&s->complete, chunk); - if (e) { - remap_exception(s, e, bio); - goto out_unlock; - } - pe = __find_pending_exception(s, bio); if (!pe) { - __invalidate_snapshot(s, pe, -ENOMEM); + __invalidate_snapshot(s, -ENOMEM); r = -EIO; goto out_unlock; } @@ -898,45 +913,27 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, remap_exception(s, &pe->e, bio); bio_list_add(&pe->snapshot_bios, bio); + r = 0; + if (!pe->started) { /* this is protected by snap->lock */ pe->started = 1; - copy_needed = 1; - } - - r = 0; - - out_unlock: - up_write(&s->lock); - - if (copy_needed) + up_write(&s->lock); start_copy(pe); - } else { + goto out; + } + } else /* * FIXME: this read path scares me because we * always use the origin when we have a pending * exception. However I can't think of a * situation where this is wrong - ejt. */ + bio->bi_bdev = s->origin->bdev; - /* Do reads */ - down_read(&s->lock); - - if (!s->valid) { - up_read(&s->lock); - return -EIO; - } - - /* See if it it has been remapped */ - e = lookup_exception(&s->complete, chunk); - if (e) - remap_exception(s, e, bio); - else - bio->bi_bdev = s->origin->bdev; - - up_read(&s->lock); - } - + out_unlock: + up_write(&s->lock); + out: return r; } @@ -1025,7 +1022,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) * is already remapped in this snapshot * and trigger an exception if not. * - * sibling_count is initialised to 1 so pending_complete() + * ref_count is initialised to 1 so pending_complete() * won't destroy the primary_pe while we're inside this loop. */ e = lookup_exception(&snap->complete, chunk); @@ -1034,7 +1031,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) pe = __find_pending_exception(snap, bio); if (!pe) { - __invalidate_snapshot(snap, pe, ENOMEM); + __invalidate_snapshot(snap, -ENOMEM); goto next_snapshot; } @@ -1056,8 +1053,8 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) } if (!pe->primary_pe) { - atomic_inc(&primary_pe->sibling_count); pe->primary_pe = primary_pe; + get_pending_exception(primary_pe); } if (!pe->started) { @@ -1070,20 +1067,20 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) } if (!primary_pe) - goto out; + return r; /* * If this is the first time we're processing this chunk and - * sibling_count is now 1 it means all the pending exceptions + * ref_count is now 1 it means all the pending exceptions * got completed while we were in the loop above, so it falls to * us here to remove the primary_pe and submit any origin_bios. */ - if (first && atomic_dec_and_test(&primary_pe->sibling_count)) { + if (first && atomic_dec_and_test(&primary_pe->ref_count)) { flush_bios(bio_list_get(&primary_pe->origin_bios)); free_pending_exception(primary_pe); /* If we got here, pe_queue is necessarily empty. 
*/ - goto out; + return r; } /* @@ -1092,7 +1089,6 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) list_for_each_entry_safe(pe, next_pe, &pe_queue, list) start_copy(pe); - out: return r; } @@ -1205,7 +1201,7 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result, static struct target_type origin_target = { .name = "snapshot-origin", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .module = THIS_MODULE, .ctr = origin_ctr, .dtr = origin_dtr, @@ -1216,7 +1212,7 @@ static struct target_type origin_target = { static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, @@ -1275,8 +1271,17 @@ static int __init dm_snapshot_init(void) goto bad5; } + ksnapd = create_singlethread_workqueue("ksnapd"); + if (!ksnapd) { + DMERR("Failed to create ksnapd workqueue."); + r = -ENOMEM; + goto bad6; + } + return 0; + bad6: + mempool_destroy(pending_pool); bad5: kmem_cache_destroy(pending_cache); bad4: @@ -1294,6 +1299,8 @@ static void __exit dm_snapshot_exit(void) { int r; + destroy_workqueue(ksnapd); + r = dm_unregister_target(&snapshot_target); if (r) DMERR("snapshot unregister failed %d", r); diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h index fdec1e2dc87..15fa2ae6cdc 100644 --- a/drivers/md/dm-snap.h +++ b/drivers/md/dm-snap.h @@ -10,7 +10,9 @@ #define DM_SNAPSHOT_H #include "dm.h" +#include "dm-bio-list.h" #include <linux/blkdev.h> +#include <linux/workqueue.h> struct exception_table { uint32_t hash_mask; @@ -112,10 +114,20 @@ struct dm_snapshot { struct exception_table pending; struct exception_table complete; + /* + * pe_lock protects all pending_exception operations and access + * as well as the snapshot_bios list. + */ + spinlock_t pe_lock; + /* The on disk metadata handler */ struct exception_store store; struct kcopyd_client *kcopyd_client; + + /* Queue of snapshot writes for ksnapd to flush */ + struct bio_list queued_bios; + struct work_struct queued_bios_work; }; /* @@ -128,10 +140,9 @@ int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new); * Constructor and destructor for the default persistent * store. */ -int dm_create_persistent(struct exception_store *store, uint32_t chunk_size); +int dm_create_persistent(struct exception_store *store); -int dm_create_transient(struct exception_store *store, - struct dm_snapshot *s, int blocksize); +int dm_create_transient(struct exception_store *store); /* * Return the number of sectors in the device. diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 75fe9493e6a..05befa91807 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -522,56 +522,61 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, return 0; } - -int dm_get_device(struct dm_target *ti, const char *path, sector_t start, - sector_t len, int mode, struct dm_dev **result) +void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) { - int r = __table_get_device(ti->table, ti, path, - start, len, mode, result); - if (!r) { - request_queue_t *q = bdev_get_queue((*result)->bdev); - struct io_restrictions *rs = &ti->limits; - - /* - * Combine the device limits low. - * - * FIXME: if we move an io_restriction struct - * into q this would just be a call to - * combine_restrictions_low() - */ + request_queue_t *q = bdev_get_queue(bdev); + struct io_restrictions *rs = &ti->limits; + + /* + * Combine the device limits low. 
+ * + * FIXME: if we move an io_restriction struct + * into q this would just be a call to + * combine_restrictions_low() + */ + rs->max_sectors = + min_not_zero(rs->max_sectors, q->max_sectors); + + /* FIXME: Device-Mapper on top of RAID-0 breaks because DM + * currently doesn't honor MD's merge_bvec_fn routine. + * In this case, we'll force DM to use PAGE_SIZE or + * smaller I/O, just to be safe. A better fix is in the + * works, but add this for the time being so it will at + * least operate correctly. + */ + if (q->merge_bvec_fn) rs->max_sectors = - min_not_zero(rs->max_sectors, q->max_sectors); + min_not_zero(rs->max_sectors, + (unsigned int) (PAGE_SIZE >> 9)); - /* FIXME: Device-Mapper on top of RAID-0 breaks because DM - * currently doesn't honor MD's merge_bvec_fn routine. - * In this case, we'll force DM to use PAGE_SIZE or - * smaller I/O, just to be safe. A better fix is in the - * works, but add this for the time being so it will at - * least operate correctly. - */ - if (q->merge_bvec_fn) - rs->max_sectors = - min_not_zero(rs->max_sectors, - (unsigned int) (PAGE_SIZE >> 9)); + rs->max_phys_segments = + min_not_zero(rs->max_phys_segments, + q->max_phys_segments); - rs->max_phys_segments = - min_not_zero(rs->max_phys_segments, - q->max_phys_segments); + rs->max_hw_segments = + min_not_zero(rs->max_hw_segments, q->max_hw_segments); - rs->max_hw_segments = - min_not_zero(rs->max_hw_segments, q->max_hw_segments); + rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size); - rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size); + rs->max_segment_size = + min_not_zero(rs->max_segment_size, q->max_segment_size); - rs->max_segment_size = - min_not_zero(rs->max_segment_size, q->max_segment_size); + rs->seg_boundary_mask = + min_not_zero(rs->seg_boundary_mask, + q->seg_boundary_mask); - rs->seg_boundary_mask = - min_not_zero(rs->seg_boundary_mask, - q->seg_boundary_mask); + rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); +} +EXPORT_SYMBOL_GPL(dm_set_device_limits); - rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); - } +int dm_get_device(struct dm_target *ti, const char *path, sector_t start, + sector_t len, int mode, struct dm_dev **result) +{ + int r = __table_get_device(ti->table, ti, path, + start, len, mode, result); + + if (!r) + dm_set_device_limits(ti, (*result)->bdev); return r; } @@ -939,9 +944,20 @@ void dm_table_postsuspend_targets(struct dm_table *t) return suspend_targets(t, 1); } -void dm_table_resume_targets(struct dm_table *t) +int dm_table_resume_targets(struct dm_table *t) { - int i; + int i, r = 0; + + for (i = 0; i < t->num_targets; i++) { + struct dm_target *ti = t->targets + i; + + if (!ti->type->preresume) + continue; + + r = ti->type->preresume(ti); + if (r) + return r; + } for (i = 0; i < t->num_targets; i++) { struct dm_target *ti = t->targets + i; @@ -949,6 +965,8 @@ void dm_table_resume_targets(struct dm_table *t) if (ti->type->resume) ti->type->resume(ti); } + + return 0; } int dm_table_any_congested(struct dm_table *t, int bdi_bits) @@ -983,6 +1001,11 @@ int dm_table_flush_all(struct dm_table *t) { struct list_head *d, *devices = dm_table_get_devices(t); int ret = 0; + unsigned i; + + for (i = 0; i < t->num_targets; i++) + if (t->targets[i].type->flush) + t->targets[i].type->flush(&t->targets[i]); for (d = devices->next; d != devices; d = d->next) { struct dm_dev *dd = list_entry(d, struct dm_dev, list); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c99bf9f0175..b5764a86c8b 100644 --- a/drivers/md/dm.c 
+++ b/drivers/md/dm.c @@ -20,6 +20,7 @@ #include <linux/idr.h> #include <linux/hdreg.h> #include <linux/blktrace_api.h> +#include <linux/smp_lock.h> #define DM_MSG_PREFIX "core" @@ -101,6 +102,8 @@ struct mapped_device { mempool_t *io_pool; mempool_t *tio_pool; + struct bio_set *bs; + /* * Event handling. */ @@ -121,16 +124,10 @@ struct mapped_device { static kmem_cache_t *_io_cache; static kmem_cache_t *_tio_cache; -static struct bio_set *dm_set; - static int __init local_init(void) { int r; - dm_set = bioset_create(16, 16, 4); - if (!dm_set) - return -ENOMEM; - /* allocate a slab for the dm_ios */ _io_cache = kmem_cache_create("dm_io", sizeof(struct dm_io), 0, 0, NULL, NULL); @@ -164,8 +161,6 @@ static void local_exit(void) kmem_cache_destroy(_tio_cache); kmem_cache_destroy(_io_cache); - bioset_free(dm_set); - if (unregister_blkdev(_major, _name) < 0) DMERR("unregister_blkdev failed"); @@ -288,6 +283,45 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) return dm_get_geometry(md, geo); } +static int dm_blk_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct mapped_device *md; + struct dm_table *map; + struct dm_target *tgt; + int r = -ENOTTY; + + /* We don't really need this lock, but we do need 'inode'. */ + unlock_kernel(); + + md = inode->i_bdev->bd_disk->private_data; + + map = dm_get_table(md); + + if (!map || !dm_table_get_size(map)) + goto out; + + /* We only support devices that have a single target */ + if (dm_table_get_num_targets(map) != 1) + goto out; + + tgt = dm_table_get_target(map, 0); + + if (dm_suspended(md)) { + r = -EAGAIN; + goto out; + } + + if (tgt->type->ioctl) + r = tgt->type->ioctl(tgt, inode, file, cmd, arg); + +out: + dm_table_put(map); + + lock_kernel(); + return r; +} + static inline struct dm_io *alloc_io(struct mapped_device *md) { return mempool_alloc(md->io_pool, GFP_NOIO); @@ -435,7 +469,7 @@ static int clone_endio(struct bio *bio, unsigned int done, int error) { int r = 0; struct target_io *tio = bio->bi_private; - struct dm_io *io = tio->io; + struct mapped_device *md = tio->io->md; dm_endio_fn endio = tio->ti->type->end_io; if (bio->bi_size) @@ -454,9 +488,15 @@ static int clone_endio(struct bio *bio, unsigned int done, int error) return 1; } - free_tio(io->md, tio); - dec_pending(io, error); + dec_pending(tio->io, error); + + /* + * Store md for cleanup instead of tio which is about to get freed. + */ + bio->bi_private = md->bs; + bio_put(bio); + free_tio(md, tio); return r; } @@ -485,6 +525,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, { int r; sector_t sector; + struct mapped_device *md; /* * Sanity checks. @@ -514,10 +555,14 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, else if (r < 0) { /* error the io and bail out */ - struct dm_io *io = tio->io; - free_tio(tio->io->md, tio); - dec_pending(io, r); + md = tio->io->md; + dec_pending(tio->io, r); + /* + * Store bio_set for cleanup. 
+ */ + clone->bi_private = md->bs; bio_put(clone); + free_tio(md, tio); } } @@ -533,7 +578,9 @@ struct clone_info { static void dm_bio_destructor(struct bio *bio) { - bio_free(bio, dm_set); + struct bio_set *bs = bio->bi_private; + + bio_free(bio, bs); } /* @@ -541,12 +588,12 @@ static void dm_bio_destructor(struct bio *bio) */ static struct bio *split_bvec(struct bio *bio, sector_t sector, unsigned short idx, unsigned int offset, - unsigned int len) + unsigned int len, struct bio_set *bs) { struct bio *clone; struct bio_vec *bv = bio->bi_io_vec + idx; - clone = bio_alloc_bioset(GFP_NOIO, 1, dm_set); + clone = bio_alloc_bioset(GFP_NOIO, 1, bs); clone->bi_destructor = dm_bio_destructor; *clone->bi_io_vec = *bv; @@ -566,11 +613,13 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector, */ static struct bio *clone_bio(struct bio *bio, sector_t sector, unsigned short idx, unsigned short bv_count, - unsigned int len) + unsigned int len, struct bio_set *bs) { struct bio *clone; - clone = bio_clone(bio, GFP_NOIO); + clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); + __bio_clone(clone, bio); + clone->bi_destructor = dm_bio_destructor; clone->bi_sector = sector; clone->bi_idx = idx; clone->bi_vcnt = idx + bv_count; @@ -601,7 +650,8 @@ static void __clone_and_map(struct clone_info *ci) * the remaining io with a single clone. */ clone = clone_bio(bio, ci->sector, ci->idx, - bio->bi_vcnt - ci->idx, ci->sector_count); + bio->bi_vcnt - ci->idx, ci->sector_count, + ci->md->bs); __map_bio(ti, clone, tio); ci->sector_count = 0; @@ -624,7 +674,8 @@ static void __clone_and_map(struct clone_info *ci) len += bv_len; } - clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len); + clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, + ci->md->bs); __map_bio(ti, clone, tio); ci->sector += len; @@ -653,7 +704,8 @@ static void __clone_and_map(struct clone_info *ci) len = min(remaining, max); clone = split_bvec(bio, ci->sector, ci->idx, - bv->bv_offset + offset, len); + bv->bv_offset + offset, len, + ci->md->bs); __map_bio(ti, clone, tio); @@ -903,7 +955,7 @@ static struct mapped_device *alloc_dev(int minor) md->queue = blk_alloc_queue(GFP_KERNEL); if (!md->queue) - goto bad1; + goto bad1_free_minor; md->queue->queuedata = md; md->queue->backing_dev_info.congested_fn = dm_any_congested; @@ -921,6 +973,10 @@ static struct mapped_device *alloc_dev(int minor) if (!md->tio_pool) goto bad3; + md->bs = bioset_create(16, 16, 4); + if (!md->bs) + goto bad_no_bioset; + md->disk = alloc_disk(1); if (!md->disk) goto bad4; @@ -948,11 +1004,14 @@ static struct mapped_device *alloc_dev(int minor) return md; bad4: + bioset_free(md->bs); + bad_no_bioset: mempool_destroy(md->tio_pool); bad3: mempool_destroy(md->io_pool); bad2: blk_cleanup_queue(md->queue); + bad1_free_minor: free_minor(minor); bad1: module_put(THIS_MODULE); @@ -971,6 +1030,7 @@ static void free_dev(struct mapped_device *md) } mempool_destroy(md->tio_pool); mempool_destroy(md->io_pool); + bioset_free(md->bs); del_gendisk(md->disk); free_minor(minor); @@ -1319,7 +1379,9 @@ int dm_resume(struct mapped_device *md) if (!map || !dm_table_get_size(map)) goto out; - dm_table_resume_targets(map); + r = dm_table_resume_targets(map); + if (r) + goto out; down_write(&md->io_lock); clear_bit(DMF_BLOCK_IO, &md->flags); @@ -1337,6 +1399,8 @@ int dm_resume(struct mapped_device *md) dm_table_unplug_all(map); + kobject_uevent(&md->disk->kobj, KOBJ_CHANGE); + r = 0; out: @@ -1377,6 +1441,7 @@ int dm_suspended(struct mapped_device *md) static struct 
block_device_operations dm_blk_dops = { .open = dm_blk_open, .release = dm_blk_close, + .ioctl = dm_blk_ioctl, .getgeo = dm_blk_getgeo, .owner = THIS_MODULE }; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 3c03c0ecab7..a48ec5e3c1f 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -21,6 +21,11 @@ #define DMERR(f, arg...) printk(KERN_ERR DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg) #define DMWARN(f, arg...) printk(KERN_WARNING DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg) #define DMINFO(f, arg...) printk(KERN_INFO DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg) +#ifdef CONFIG_DM_DEBUG +# define DMDEBUG(f, arg...) printk(KERN_DEBUG DM_NAME ": " DM_MSG_PREFIX " DEBUG: " f "\n", ## arg) +#else +# define DMDEBUG(f, arg...) do {} while (0) +#endif #define DMEMIT(x...) sz += ((sz >= maxlen) ? \ 0 : scnprintf(result + sz, maxlen - sz, x)) @@ -52,7 +57,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q); struct list_head *dm_table_get_devices(struct dm_table *t); void dm_table_presuspend_targets(struct dm_table *t); void dm_table_postsuspend_targets(struct dm_table *t); -void dm_table_resume_targets(struct dm_table *t); +int dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); void dm_table_unplug_all(struct dm_table *t); int dm_table_flush_all(struct dm_table *t); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index b99c19c7eb2..c625ddb8833 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -111,6 +111,19 @@ static int linear_issue_flush(request_queue_t *q, struct gendisk *disk, return ret; } +static int linear_congested(void *data, int bits) +{ + mddev_t *mddev = data; + linear_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + for (i = 0; i < mddev->raid_disks && !ret ; i++) { + request_queue_t *q = bdev_get_queue(conf->disks[i].rdev->bdev); + ret |= bdi_congested(&q->backing_dev_info, bits); + } + return ret; +} + static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) { linear_conf_t *conf; @@ -269,6 +282,8 @@ static int linear_run (mddev_t *mddev) blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->unplug_fn = linear_unplug; mddev->queue->issue_flush_fn = linear_issue_flush; + mddev->queue->backing_dev_info.congested_fn = linear_congested; + mddev->queue->backing_dev_info.congested_data = mddev; return 0; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 8dbab2ef388..38a0a5741d5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -389,8 +389,12 @@ static int super_written(struct bio *bio, unsigned int bytes_done, int error) if (bio->bi_size) return 1; - if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) + if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { + printk("md: super_written gets error=%d, uptodate=%d\n", + error, test_bit(BIO_UPTODATE, &bio->bi_flags)); + WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); md_error(mddev, rdev); + } if (atomic_dec_and_test(&mddev->pending_writes)) wake_up(&mddev->sb_wait); @@ -1587,7 +1591,7 @@ static void sync_sbs(mddev_t * mddev, int nospares) } } -void md_update_sb(mddev_t * mddev) +static void md_update_sb(mddev_t * mddev, int force_change) { int err; struct list_head *tmp; @@ -1598,7 +1602,18 @@ void md_update_sb(mddev_t * mddev) repeat: spin_lock_irq(&mddev->write_lock); - if (mddev->degraded && mddev->sb_dirty == 3) + set_bit(MD_CHANGE_PENDING, &mddev->flags); + if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) + force_change = 1; + if (test_and_clear_bit(MD_CHANGE_CLEAN, 
&mddev->flags)) + /* just a clean<-> dirty transition, possibly leave spares alone, + * though if events isn't the right even/odd, we will have to do + * spares after all + */ + nospares = 1; + if (force_change) + nospares = 0; + if (mddev->degraded) /* If the array is degraded, then skipping spares is both * dangerous and fairly pointless. * Dangerous because a device that was removed from the array @@ -1608,20 +1623,14 @@ repeat: * then a recovery will happen and soon that array won't * be degraded any more and the spare can go back to sleep then. */ - mddev->sb_dirty = 1; + nospares = 0; sync_req = mddev->in_sync; mddev->utime = get_seconds(); - if (mddev->sb_dirty == 3) - /* just a clean<-> dirty transition, possibly leave spares alone, - * though if events isn't the right even/odd, we will have to do - * spares after all - */ - nospares = 1; /* If this is just a dirty<->clean transition, and the array is clean * and 'events' is odd, we can roll back to the previous clean state */ - if (mddev->sb_dirty == 3 + if (nospares && (mddev->in_sync && mddev->recovery_cp == MaxSector) && (mddev->events & 1)) mddev->events--; @@ -1652,7 +1661,6 @@ repeat: MD_BUG(); mddev->events --; } - mddev->sb_dirty = 2; sync_sbs(mddev, nospares); /* @@ -1660,7 +1668,7 @@ repeat: * nonpersistent superblocks */ if (!mddev->persistent) { - mddev->sb_dirty = 0; + clear_bit(MD_CHANGE_PENDING, &mddev->flags); spin_unlock_irq(&mddev->write_lock); wake_up(&mddev->sb_wait); return; @@ -1697,20 +1705,20 @@ repeat: break; } md_super_wait(mddev); - /* if there was a failure, sb_dirty was set to 1, and we re-write super */ + /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ spin_lock_irq(&mddev->write_lock); - if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) { + if (mddev->in_sync != sync_req || + test_bit(MD_CHANGE_DEVS, &mddev->flags)) { /* have to write it out again */ spin_unlock_irq(&mddev->write_lock); goto repeat; } - mddev->sb_dirty = 0; + clear_bit(MD_CHANGE_PENDING, &mddev->flags); spin_unlock_irq(&mddev->write_lock); wake_up(&mddev->sb_wait); } -EXPORT_SYMBOL_GPL(md_update_sb); /* words written to sysfs files may, or my not, be \n terminated. * We want to accept with case. For this we use cmd_match. @@ -1783,7 +1791,7 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) else { mddev_t *mddev = rdev->mddev; kick_rdev_from_array(rdev); - md_update_sb(mddev); + md_update_sb(mddev, 1); md_new_event(mddev); err = 0; } @@ -2426,7 +2434,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) spin_lock_irq(&mddev->write_lock); if (atomic_read(&mddev->writes_pending) == 0) { mddev->in_sync = 1; - mddev->sb_dirty = 1; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); } spin_unlock_irq(&mddev->write_lock); } else { @@ -2438,7 +2446,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) case active: if (mddev->pers) { restart_array(mddev); - mddev->sb_dirty = 0; + clear_bit(MD_CHANGE_CLEAN, &mddev->flags); wake_up(&mddev->sb_wait); err = 0; } else { @@ -2520,6 +2528,36 @@ static struct md_sysfs_entry md_new_device = __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); static ssize_t +bitmap_store(mddev_t *mddev, const char *buf, size_t len) +{ + char *end; + unsigned long chunk, end_chunk; + + if (!mddev->bitmap) + goto out; + /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... 
(range) */ + while (*buf) { + chunk = end_chunk = simple_strtoul(buf, &end, 0); + if (buf == end) break; + if (*end == '-') { /* range */ + buf = end + 1; + end_chunk = simple_strtoul(buf, &end, 0); + if (buf == end) break; + } + if (*end && !isspace(*end)) break; + bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); + buf = end; + while (isspace(*buf)) buf++; + } + bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ +out: + return len; +} + +static struct md_sysfs_entry md_bitmap = +__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); + +static ssize_t size_show(mddev_t *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->size); @@ -2543,7 +2581,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len) if (mddev->pers) { err = update_size(mddev, size); - md_update_sb(mddev); + md_update_sb(mddev, 1); } else { if (mddev->size == 0 || mddev->size > size) @@ -2839,6 +2877,7 @@ static struct attribute *md_redundancy_attrs[] = { &md_sync_completed.attr, &md_suspend_lo.attr, &md_suspend_hi.attr, + &md_bitmap.attr, NULL, }; static struct attribute_group md_redundancy_group = { @@ -3111,8 +3150,8 @@ static int do_md_run(mddev_t * mddev) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - if (mddev->sb_dirty) - md_update_sb(mddev); + if (mddev->flags) + md_update_sb(mddev, 0); set_capacity(disk, mddev->array_size<<1); @@ -3275,10 +3314,10 @@ static int do_md_stop(mddev_t * mddev, int mode) if (mddev->ro) mddev->ro = 0; } - if (!mddev->in_sync || mddev->sb_dirty) { + if (!mddev->in_sync || mddev->flags) { /* mark array as shutdown cleanly */ mddev->in_sync = 1; - md_update_sb(mddev); + md_update_sb(mddev, 1); } if (mode == 1) set_disk_ro(disk, 1); @@ -3374,6 +3413,7 @@ static void autorun_devices(int part) printk(KERN_INFO "md: autorun ...\n"); while (!list_empty(&pending_raid_disks)) { + int unit; dev_t dev; LIST_HEAD(candidates); rdev0 = list_entry(pending_raid_disks.next, @@ -3393,16 +3433,19 @@ static void autorun_devices(int part) * mostly sane superblocks. It's time to allocate the * mddev. */ - if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) { + if (part) { + dev = MKDEV(mdp_major, + rdev0->preferred_minor << MdpMinorShift); + unit = MINOR(dev) >> MdpMinorShift; + } else { + dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); + unit = MINOR(dev); + } + if (rdev0->preferred_minor != unit) { printk(KERN_INFO "md: unit number in %s is bad: %d\n", bdevname(rdev0->bdev, b), rdev0->preferred_minor); break; } - if (part) - dev = MKDEV(mdp_major, - rdev0->preferred_minor << MdpMinorShift); - else - dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); md_probe(dev, NULL, NULL); mddev = mddev_find(dev); @@ -3440,67 +3483,6 @@ static void autorun_devices(int part) printk(KERN_INFO "md: ... autorun DONE.\n"); } -/* - * import RAID devices based on one partition - * if possible, the array gets run as well. 
- */ - -static int autostart_array(dev_t startdev) -{ - char b[BDEVNAME_SIZE]; - int err = -EINVAL, i; - mdp_super_t *sb = NULL; - mdk_rdev_t *start_rdev = NULL, *rdev; - - start_rdev = md_import_device(startdev, 0, 0); - if (IS_ERR(start_rdev)) - return err; - - - /* NOTE: this can only work for 0.90.0 superblocks */ - sb = (mdp_super_t*)page_address(start_rdev->sb_page); - if (sb->major_version != 0 || - sb->minor_version != 90 ) { - printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n"); - export_rdev(start_rdev); - return err; - } - - if (test_bit(Faulty, &start_rdev->flags)) { - printk(KERN_WARNING - "md: can not autostart based on faulty %s!\n", - bdevname(start_rdev->bdev,b)); - export_rdev(start_rdev); - return err; - } - list_add(&start_rdev->same_set, &pending_raid_disks); - - for (i = 0; i < MD_SB_DISKS; i++) { - mdp_disk_t *desc = sb->disks + i; - dev_t dev = MKDEV(desc->major, desc->minor); - - if (!dev) - continue; - if (dev == startdev) - continue; - if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) - continue; - rdev = md_import_device(dev, 0, 0); - if (IS_ERR(rdev)) - continue; - - list_add(&rdev->same_set, &pending_raid_disks); - } - - /* - * possibly return codes - */ - autorun_devices(0); - return 0; - -} - - static int get_version(void __user * arg) { mdu_version_t ver; @@ -3808,7 +3790,7 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev) goto busy; kick_rdev_from_array(rdev); - md_update_sb(mddev); + md_update_sb(mddev, 1); md_new_event(mddev); return 0; @@ -3885,7 +3867,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) rdev->raid_disk = -1; - md_update_sb(mddev); + md_update_sb(mddev, 1); /* * Kick recovery, maybe this spare has to be added to the @@ -4016,7 +3998,8 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->max_disks = MD_SB_DISKS; - mddev->sb_dirty = 1; + mddev->flags = 0; + set_bit(MD_CHANGE_DEVS, &mddev->flags); mddev->default_bitmap_offset = MD_SB_BYTES >> 9; mddev->bitmap_offset = 0; @@ -4185,7 +4168,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) mddev->bitmap_offset = 0; } } - md_update_sb(mddev); + md_update_sb(mddev, 1); return rv; } @@ -4259,27 +4242,6 @@ static int md_ioctl(struct inode *inode, struct file *file, goto abort; } - - if (cmd == START_ARRAY) { - /* START_ARRAY doesn't need to lock the array as autostart_array - * does the locking, and it could even be a different array - */ - static int cnt = 3; - if (cnt > 0 ) { - printk(KERN_WARNING - "md: %s(pid %d) used deprecated START_ARRAY ioctl. " - "This will not be supported beyond July 2006\n", - current->comm, current->pid); - cnt--; - } - err = autostart_array(new_decode_dev(arg)); - if (err) { - printk(KERN_WARNING "md: autostart failed!\n"); - goto abort; - } - goto done; - } - err = mddev_lock(mddev); if (err) { printk(KERN_INFO @@ -4687,9 +4649,11 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? "reshape" : - (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? - "resync" : "recovery")), - per_milli/10, per_milli % 10, + (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? + "check" : + (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 
+ "resync" : "recovery"))), + per_milli/10, per_milli % 10, (unsigned long long) resync, (unsigned long long) max_blocks); @@ -5042,12 +5006,12 @@ void md_write_start(mddev_t *mddev, struct bio *bi) spin_lock_irq(&mddev->write_lock); if (mddev->in_sync) { mddev->in_sync = 0; - mddev->sb_dirty = 3; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); md_wakeup_thread(mddev->thread); } spin_unlock_irq(&mddev->write_lock); } - wait_event(mddev->sb_wait, mddev->sb_dirty==0); + wait_event(mddev->sb_wait, mddev->flags==0); } void md_write_end(mddev_t *mddev) @@ -5078,6 +5042,7 @@ void md_do_sync(mddev_t *mddev) int skipped = 0; struct list_head *rtmp; mdk_rdev_t *rdev; + char *desc; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) @@ -5085,6 +5050,18 @@ void md_do_sync(mddev_t *mddev) if (mddev->ro) /* never try to sync a read-only array */ return; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + desc = "data-check"; + else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + desc = "requested-resync"; + else + desc = "resync"; + } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + desc = "reshape"; + else + desc = "recovery"; + /* we overload curr_resync somewhat here. * 0 == not engaged in resync at all * 2 == checking that there is no conflict with another sync @@ -5128,10 +5105,10 @@ void md_do_sync(mddev_t *mddev) prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE); if (!kthread_should_stop() && mddev2->curr_resync >= mddev->curr_resync) { - printk(KERN_INFO "md: delaying resync of %s" - " until %s has finished resync (they" + printk(KERN_INFO "md: delaying %s of %s" + " until %s has finished (they" " share one or more physical units)\n", - mdname(mddev), mdname(mddev2)); + desc, mdname(mddev), mdname(mddev2)); mddev_put(mddev2); schedule(); finish_wait(&resync_wait, &wq); @@ -5167,12 +5144,12 @@ void md_do_sync(mddev_t *mddev) j = rdev->recovery_offset; } - printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); - printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" - " %d KB/sec/disc.\n", speed_min(mddev)); + printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); + printk(KERN_INFO "md: minimum _guaranteed_ speed:" + " %d KB/sec/disk.\n", speed_min(mddev)); printk(KERN_INFO "md: using maximum available idle IO bandwidth " - "(but not more than %d KB/sec) for reconstruction.\n", - speed_max(mddev)); + "(but not more than %d KB/sec) for %s.\n", + speed_max(mddev), desc); is_mddev_idle(mddev); /* this also initializes IO event counters */ @@ -5198,8 +5175,8 @@ void md_do_sync(mddev_t *mddev) if (j>2) { printk(KERN_INFO - "md: resuming recovery of %s from checkpoint.\n", - mdname(mddev)); + "md: resuming %s of %s from checkpoint.\n", + desc, mdname(mddev)); mddev->curr_resync = j; } @@ -5282,7 +5259,7 @@ void md_do_sync(mddev_t *mddev) } } } - printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev)); + printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); /* * this also signals 'finished resyncing' to md_stop */ @@ -5302,8 +5279,8 @@ void md_do_sync(mddev_t *mddev) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { if (mddev->curr_resync >= mddev->recovery_cp) { printk(KERN_INFO - "md: checkpointing recovery of %s.\n", - mdname(mddev)); + "md: checkpointing %s of %s.\n", + desc, mdname(mddev)); mddev->recovery_cp = mddev->curr_resync; } } else @@ -5317,7 +5294,6 @@ void md_do_sync(mddev_t *mddev) !test_bit(In_sync, &rdev->flags) && 
rdev->recovery_offset < mddev->curr_resync) rdev->recovery_offset = mddev->curr_resync; - mddev->sb_dirty = 1; } } @@ -5374,7 +5350,7 @@ void md_check_recovery(mddev_t *mddev) } if ( ! ( - mddev->sb_dirty || + mddev->flags || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || (mddev->safemode == 1) || @@ -5390,14 +5366,14 @@ void md_check_recovery(mddev_t *mddev) if (mddev->safemode && !atomic_read(&mddev->writes_pending) && !mddev->in_sync && mddev->recovery_cp == MaxSector) { mddev->in_sync = 1; - mddev->sb_dirty = 3; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); } if (mddev->safemode == 1) mddev->safemode = 0; spin_unlock_irq(&mddev->write_lock); - if (mddev->sb_dirty) - md_update_sb(mddev); + if (mddev->flags) + md_update_sb(mddev, 0); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && @@ -5416,7 +5392,7 @@ void md_check_recovery(mddev_t *mddev) /* activate any spares */ mddev->pers->spare_active(mddev); } - md_update_sb(mddev); + md_update_sb(mddev, 1); /* if array is no-longer degraded, then any saved_raid_disk * information must be scrapped @@ -5556,22 +5532,15 @@ static void md_geninit(void) static int __init md_init(void) { - printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," - " MD_SB_DISKS=%d\n", - MD_MAJOR_VERSION, MD_MINOR_VERSION, - MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); - printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI, - BITMAP_MINOR); - if (register_blkdev(MAJOR_NR, "md")) return -1; if ((mdp_major=register_blkdev(0, "mdp"))<=0) { unregister_blkdev(MAJOR_NR, "md"); return -1; } - blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, - md_probe, NULL, NULL); - blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE, + blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE, + md_probe, NULL, NULL); + blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, md_probe, NULL, NULL); register_reboot_notifier(&md_notifier); @@ -5630,8 +5599,8 @@ static __exit void md_exit(void) mddev_t *mddev; struct list_head *tmp; - blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); - blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift); + blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS); + blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); unregister_blkdev(MAJOR_NR,"md"); unregister_blkdev(mdp_major, "mdp"); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 1cc9de44ce8..171ff41b52b 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -228,6 +228,28 @@ static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk, rcu_read_unlock(); return ret; } +static int multipath_congested(void *data, int bits) +{ + mddev_t *mddev = data; + multipath_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + rcu_read_lock(); + for (i = 0; i < mddev->raid_disks ; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + request_queue_t *q = bdev_get_queue(rdev->bdev); + + ret |= bdi_congested(&q->backing_dev_info, bits); + /* Just like multipath_map, we just check the + * first available device + */ + break; + } + } + rcu_read_unlock(); + return ret; +} /* * Careful, this can execute in IRQ contexts as well! 
@@ -253,7 +275,7 @@ static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) char b[BDEVNAME_SIZE]; clear_bit(In_sync, &rdev->flags); set_bit(Faulty, &rdev->flags); - mddev->sb_dirty = 1; + set_bit(MD_CHANGE_DEVS, &mddev->flags); conf->working_disks--; printk(KERN_ALERT "multipath: IO failure on %s," " disabling IO path. \n Operation continuing" @@ -470,7 +492,6 @@ static int multipath_run (mddev_t *mddev) } conf->raid_disks = mddev->raid_disks; - mddev->sb_dirty = 1; conf->mddev = mddev; spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); @@ -510,6 +531,8 @@ static int multipath_run (mddev_t *mddev) mddev->queue->unplug_fn = multipath_unplug; mddev->queue->issue_flush_fn = multipath_issue_flush; + mddev->queue->backing_dev_info.congested_fn = multipath_congested; + mddev->queue->backing_dev_info.congested_data = mddev; return 0; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index cb8c6317e4e..dfe32149ad3 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -60,6 +60,21 @@ static int raid0_issue_flush(request_queue_t *q, struct gendisk *disk, return ret; } +static int raid0_congested(void *data, int bits) +{ + mddev_t *mddev = data; + raid0_conf_t *conf = mddev_to_conf(mddev); + mdk_rdev_t **devlist = conf->strip_zone[0].dev; + int i, ret = 0; + + for (i = 0; i < mddev->raid_disks && !ret ; i++) { + request_queue_t *q = bdev_get_queue(devlist[i]->bdev); + + ret |= bdi_congested(&q->backing_dev_info, bits); + } + return ret; +} + static int create_strip_zones (mddev_t *mddev) { @@ -236,6 +251,8 @@ static int create_strip_zones (mddev_t *mddev) mddev->queue->unplug_fn = raid0_unplug; mddev->queue->issue_flush_fn = raid0_issue_flush; + mddev->queue->backing_dev_info.congested_fn = raid0_congested; + mddev->queue->backing_dev_info.congested_data = mddev; printk("raid0: done.\n"); return 0; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3b4d69c0562..dc9d2def027 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -271,7 +271,7 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int */ update_head_pos(mirror, r1_bio); - if (uptodate || conf->working_disks <= 1) { + if (uptodate || (conf->raid_disks - conf->mddev->degraded) <= 1) { /* * Set R1BIO_Uptodate in our master bio, so that * we will return a good error code for to the higher @@ -601,6 +601,32 @@ static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, return ret; } +static int raid1_congested(void *data, int bits) +{ + mddev_t *mddev = data; + conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + rcu_read_lock(); + for (i = 0; i < mddev->raid_disks; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + request_queue_t *q = bdev_get_queue(rdev->bdev); + + /* Note the '|| 1' - when read_balance prefers + * non-congested targets, it can be removed + */ + if ((bits & (1<<BDI_write_congested)) || 1) + ret |= bdi_congested(&q->backing_dev_info, bits); + else + ret &= bdi_congested(&q->backing_dev_info, bits); + } + } + rcu_read_unlock(); + return ret; +} + + /* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. 
@@ -929,7 +955,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) int i; seq_printf(seq, " [%d/%d] [", conf->raid_disks, - conf->working_disks); + conf->raid_disks - mddev->degraded); rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); @@ -953,26 +979,27 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * else mark the drive as failed */ if (test_bit(In_sync, &rdev->flags) - && conf->working_disks == 1) + && (conf->raid_disks - mddev->degraded) == 1) /* * Don't fail the drive, act as though we were just a * normal single drive */ return; - if (test_bit(In_sync, &rdev->flags)) { + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; - conf->working_disks--; + spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery is running, make sure it aborts. */ set_bit(MD_RECOVERY_ERR, &mddev->recovery); } - clear_bit(In_sync, &rdev->flags); set_bit(Faulty, &rdev->flags); - mddev->sb_dirty = 1; + set_bit(MD_CHANGE_DEVS, &mddev->flags); printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n" " Operation continuing on %d devices\n", - bdevname(rdev->bdev,b), conf->working_disks); + bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); } static void print_conf(conf_t *conf) @@ -984,7 +1011,7 @@ static void print_conf(conf_t *conf) printk("(!conf)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->working_disks, + printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); rcu_read_lock(); @@ -1023,10 +1050,11 @@ static int raid1_spare_active(mddev_t *mddev) mdk_rdev_t *rdev = conf->mirrors[i].rdev; if (rdev && !test_bit(Faulty, &rdev->flags) - && !test_bit(In_sync, &rdev->flags)) { - conf->working_disks++; + && !test_and_set_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded--; - set_bit(In_sync, &rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); } } @@ -1368,6 +1396,95 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) * 3. Performs writes following reads for array syncronising. */ +static void fix_read_error(conf_t *conf, int read_disk, + sector_t sect, int sectors) +{ + mddev_t *mddev = conf->mddev; + while(sectors) { + int s = sectors; + int d = read_disk; + int success = 0; + int start; + mdk_rdev_t *rdev; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + + do { + /* Note: no rcu protection needed here + * as this is synchronous in the raid1d thread + * which is the thread that might remove + * a device. If raid1d ever becomes multi-threaded.... 
+ */ + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags) && + sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, + conf->tmppage, READ)) + success = 1; + else { + d++; + if (d == conf->raid_disks) + d = 0; + } + } while (!success && d != read_disk); + + if (!success) { + /* Cannot read from anywhere -- bye bye array */ + md_error(mddev, conf->mirrors[read_disk].rdev); + break; + } + /* write it back and re-read */ + start = d; + while (d != read_disk) { + if (d==0) + d = conf->raid_disks; + d--; + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags)) { + if (sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, conf->tmppage, WRITE) + == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + } + } + d = start; + while (d != read_disk) { + char b[BDEVNAME_SIZE]; + if (d==0) + d = conf->raid_disks; + d--; + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags)) { + if (sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, conf->tmppage, READ) + == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + else { + atomic_add(s, &rdev->corrected_errors); + printk(KERN_INFO + "raid1:%s: read error corrected " + "(%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)sect + + rdev->data_offset, + bdevname(rdev->bdev, b)); + } + } + } + sectors -= s; + sect += s; + } +} + static void raid1d(mddev_t *mddev) { r1bio_t *r1_bio; @@ -1460,86 +1577,14 @@ static void raid1d(mddev_t *mddev) * This is all done synchronously while the array is * frozen */ - sector_t sect = r1_bio->sector; - int sectors = r1_bio->sectors; - freeze_array(conf); - if (mddev->ro == 0) while(sectors) { - int s = sectors; - int d = r1_bio->read_disk; - int success = 0; - - if (s > (PAGE_SIZE>>9)) - s = PAGE_SIZE >> 9; - - do { - /* Note: no rcu protection needed here - * as this is synchronous in the raid1d thread - * which is the thread that might remove - * a device. If raid1d ever becomes multi-threaded.... 
- */ - rdev = conf->mirrors[d].rdev; - if (rdev && - test_bit(In_sync, &rdev->flags) && - sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, - conf->tmppage, READ)) - success = 1; - else { - d++; - if (d == conf->raid_disks) - d = 0; - } - } while (!success && d != r1_bio->read_disk); - - if (success) { - /* write it back and re-read */ - int start = d; - while (d != r1_bio->read_disk) { - if (d==0) - d = conf->raid_disks; - d--; - rdev = conf->mirrors[d].rdev; - if (rdev && - test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - } - } - d = start; - while (d != r1_bio->read_disk) { - if (d==0) - d = conf->raid_disks; - d--; - rdev = conf->mirrors[d].rdev; - if (rdev && - test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, READ) == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - else { - atomic_add(s, &rdev->corrected_errors); - printk(KERN_INFO "raid1:%s: read error corrected (%d sectors at %llu on %s)\n", - mdname(mddev), s, (unsigned long long)(sect + rdev->data_offset), bdevname(rdev->bdev, b)); - } - } - } - } else { - /* Cannot read from anywhere -- bye bye array */ - md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); - break; - } - sectors -= s; - sect += s; + if (mddev->ro == 0) { + freeze_array(conf); + fix_read_error(conf, r1_bio->read_disk, + r1_bio->sector, + r1_bio->sectors); + unfreeze_array(conf); } - unfreeze_array(conf); - bio = r1_bio->bios[r1_bio->read_disk]; if ((disk=read_balance(conf, r1_bio)) == -1) { printk(KERN_ALERT "raid1: %s: unrecoverable I/O" @@ -1884,15 +1929,11 @@ static int run(mddev_t *mddev) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->head_position = 0; - if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags)) - conf->working_disks++; } conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); - if (conf->working_disks == 1) - mddev->recovery_cp = MaxSector; spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); @@ -1900,11 +1941,6 @@ static int run(mddev_t *mddev) bio_list_init(&conf->pending_bio_list); bio_list_init(&conf->flushing_bio_list); - if (!conf->working_disks) { - printk(KERN_ERR "raid1: no operational mirrors for %s\n", - mdname(mddev)); - goto out_free_conf; - } mddev->degraded = 0; for (i = 0; i < conf->raid_disks; i++) { @@ -1917,6 +1953,13 @@ static int run(mddev_t *mddev) mddev->degraded++; } } + if (mddev->degraded == conf->raid_disks) { + printk(KERN_ERR "raid1: no operational mirrors for %s\n", + mdname(mddev)); + goto out_free_conf; + } + if (conf->raid_disks - mddev->degraded == 1) + mddev->recovery_cp = MaxSector; /* * find the first working one and use it as a starting point @@ -1948,6 +1991,8 @@ static int run(mddev_t *mddev) mddev->queue->unplug_fn = raid1_unplug; mddev->queue->issue_flush_fn = raid1_issue_flush; + mddev->queue->backing_dev_info.congested_fn = raid1_congested; + mddev->queue->backing_dev_info.congested_data = mddev; return 0; @@ -2035,7 +2080,7 @@ static int raid1_reshape(mddev_t *mddev) mirror_info_t *newmirrors; conf_t *conf = mddev_to_conf(mddev); int cnt, raid_disks; - + unsigned long flags; int d, d2; /* Cannot change chunk_size, layout, or level */ @@ -2094,7 +2139,9 @@ static int raid1_reshape(mddev_t *mddev) kfree(conf->poolinfo); conf->poolinfo = 
newpoolinfo; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded += (raid_disks - conf->raid_disks); + spin_unlock_irqrestore(&conf->device_lock, flags); conf->raid_disks = mddev->raid_disks = raid_disks; mddev->delta_disks = 0; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 016ddb831c9..1250f0eab4a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -648,6 +648,26 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, return ret; } +static int raid10_congested(void *data, int bits) +{ + mddev_t *mddev = data; + conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + rcu_read_lock(); + for (i = 0; i < mddev->raid_disks && ret == 0; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + request_queue_t *q = bdev_get_queue(rdev->bdev); + + ret |= bdi_congested(&q->backing_dev_info, bits); + } + } + rcu_read_unlock(); + return ret; +} + + /* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. @@ -921,7 +941,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) seq_printf(seq, " %d far-copies", conf->far_copies); } seq_printf(seq, " [%d/%d] [", conf->raid_disks, - conf->working_disks); + conf->raid_disks - mddev->degraded); for (i = 0; i < conf->raid_disks; i++) seq_printf(seq, "%s", conf->mirrors[i].rdev && @@ -941,7 +961,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * else mark the drive as failed */ if (test_bit(In_sync, &rdev->flags) - && conf->working_disks == 1) + && conf->raid_disks-mddev->degraded == 1) /* * Don't fail the drive, just return an IO error. * The test should really be more sophisticated than @@ -950,20 +970,21 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * really dead" tests... */ return; - if (test_bit(In_sync, &rdev->flags)) { + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; - conf->working_disks--; + spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery is running, make sure it aborts. */ set_bit(MD_RECOVERY_ERR, &mddev->recovery); } - clear_bit(In_sync, &rdev->flags); set_bit(Faulty, &rdev->flags); - mddev->sb_dirty = 1; + set_bit(MD_CHANGE_DEVS, &mddev->flags); printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n" " Operation continuing on %d devices\n", - bdevname(rdev->bdev,b), conf->working_disks); + bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); } static void print_conf(conf_t *conf) @@ -976,7 +997,7 @@ static void print_conf(conf_t *conf) printk("(!conf)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->working_disks, + printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); for (i = 0; i < conf->raid_disks; i++) { @@ -1034,10 +1055,11 @@ static int raid10_spare_active(mddev_t *mddev) tmp = conf->mirrors + i; if (tmp->rdev && !test_bit(Faulty, &tmp->rdev->flags) - && !test_bit(In_sync, &tmp->rdev->flags)) { - conf->working_disks++; + && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded--; - set_bit(In_sync, &tmp->rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); } } @@ -1350,9 +1372,119 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) * * 1. Retries failed read operations on working mirrors. * 2. 
Updates the raid superblock when problems encounter. - * 3. Performs writes following reads for array syncronising. + * 3. Performs writes following reads for array synchronising. */ +static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) +{ + int sect = 0; /* Offset from r10_bio->sector */ + int sectors = r10_bio->sectors; + mdk_rdev_t*rdev; + while(sectors) { + int s = sectors; + int sl = r10_bio->read_slot; + int success = 0; + int start; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + + rcu_read_lock(); + do { + int d = r10_bio->devs[sl].devnum; + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev && + test_bit(In_sync, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + success = sync_page_io(rdev->bdev, + r10_bio->devs[sl].addr + + sect + rdev->data_offset, + s<<9, + conf->tmppage, READ); + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + if (success) + break; + } + sl++; + if (sl == conf->copies) + sl = 0; + } while (!success && sl != r10_bio->read_slot); + rcu_read_unlock(); + + if (!success) { + /* Cannot read from anywhere -- bye bye array */ + int dn = r10_bio->devs[r10_bio->read_slot].devnum; + md_error(mddev, conf->mirrors[dn].rdev); + break; + } + + start = sl; + /* write it back and re-read */ + rcu_read_lock(); + while (sl != r10_bio->read_slot) { + int d; + if (sl==0) + sl = conf->copies; + sl--; + d = r10_bio->devs[sl].devnum; + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev && + test_bit(In_sync, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + atomic_add(s, &rdev->corrected_errors); + if (sync_page_io(rdev->bdev, + r10_bio->devs[sl].addr + + sect + rdev->data_offset, + s<<9, conf->tmppage, WRITE) + == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + } + } + sl = start; + while (sl != r10_bio->read_slot) { + int d; + if (sl==0) + sl = conf->copies; + sl--; + d = r10_bio->devs[sl].devnum; + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev && + test_bit(In_sync, &rdev->flags)) { + char b[BDEVNAME_SIZE]; + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + if (sync_page_io(rdev->bdev, + r10_bio->devs[sl].addr + + sect + rdev->data_offset, + s<<9, conf->tmppage, READ) == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + else + printk(KERN_INFO + "raid10:%s: read error corrected" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)sect+ + rdev->data_offset, + bdevname(rdev->bdev, b)); + + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + } + } + rcu_read_unlock(); + + sectors -= s; + sect += s; + } +} + static void raid10d(mddev_t *mddev) { r10bio_t *r10_bio; @@ -1413,105 +1545,12 @@ static void raid10d(mddev_t *mddev) * This is all done synchronously while the array is * frozen. 
*/ - int sect = 0; /* Offset from r10_bio->sector */ - int sectors = r10_bio->sectors; - freeze_array(conf); - if (mddev->ro == 0) while(sectors) { - int s = sectors; - int sl = r10_bio->read_slot; - int success = 0; - - if (s > (PAGE_SIZE>>9)) - s = PAGE_SIZE >> 9; - - rcu_read_lock(); - do { - int d = r10_bio->devs[sl].devnum; - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - test_bit(In_sync, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - success = sync_page_io(rdev->bdev, - r10_bio->devs[sl].addr + - sect + rdev->data_offset, - s<<9, - conf->tmppage, READ); - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - if (success) - break; - } - sl++; - if (sl == conf->copies) - sl = 0; - } while (!success && sl != r10_bio->read_slot); - rcu_read_unlock(); - - if (success) { - int start = sl; - /* write it back and re-read */ - rcu_read_lock(); - while (sl != r10_bio->read_slot) { - int d; - if (sl==0) - sl = conf->copies; - sl--; - d = r10_bio->devs[sl].devnum; - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - test_bit(In_sync, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - atomic_add(s, &rdev->corrected_errors); - if (sync_page_io(rdev->bdev, - r10_bio->devs[sl].addr + - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - sl = start; - while (sl != r10_bio->read_slot) { - int d; - if (sl==0) - sl = conf->copies; - sl--; - d = r10_bio->devs[sl].devnum; - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - test_bit(In_sync, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - if (sync_page_io(rdev->bdev, - r10_bio->devs[sl].addr + - sect + rdev->data_offset, - s<<9, conf->tmppage, READ) == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - else - printk(KERN_INFO "raid10:%s: read error corrected (%d sectors at %llu on %s)\n", - mdname(mddev), s, (unsigned long long)(sect+rdev->data_offset), bdevname(rdev->bdev, b)); - - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - rcu_read_unlock(); - } else { - /* Cannot read from anywhere -- bye bye array */ - md_error(mddev, conf->mirrors[r10_bio->devs[r10_bio->read_slot].devnum].rdev); - break; - } - sectors -= s; - sect += s; + if (mddev->ro == 0) { + freeze_array(conf); + fix_read_error(conf, mddev, r10_bio); + unfreeze_array(conf); } - unfreeze_array(conf); - bio = r10_bio->devs[r10_bio->read_slot].bio; r10_bio->devs[r10_bio->read_slot].bio = mddev->ro ? IO_BLOCKED : NULL; @@ -2018,8 +2057,6 @@ static int run(mddev_t *mddev) mddev->queue->max_sectors = (PAGE_SIZE>>9); disk->head_position = 0; - if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags)) - conf->working_disks++; } conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; @@ -2077,6 +2114,8 @@ static int run(mddev_t *mddev) mddev->queue->unplug_fn = raid10_unplug; mddev->queue->issue_flush_fn = raid10_issue_flush; + mddev->queue->backing_dev_info.congested_fn = raid10_congested; + mddev->queue->backing_dev_info.congested_data = mddev; /* Calculate max read-ahead size. * We need to readahead at least twice a whole stripe.... 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 45006600716..37e4ff661b6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -636,7 +636,6 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done, struct stripe_head *sh = bi->bi_private; raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks, i; - unsigned long flags; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); if (bi->bi_size) @@ -654,7 +653,6 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done, return 0; } - spin_lock_irqsave(&conf->device_lock, flags); if (!uptodate) md_error(conf->mddev, conf->disks[i].rdev); @@ -662,8 +660,7 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done, clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); - __release_stripe(conf, sh); - spin_unlock_irqrestore(&conf->device_lock, flags); + release_stripe(sh); return 0; } @@ -696,12 +693,12 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) PRINTK("raid5: error called\n"); if (!test_bit(Faulty, &rdev->flags)) { - mddev->sb_dirty = 1; - if (test_bit(In_sync, &rdev->flags)) { - conf->working_disks--; + set_bit(MD_CHANGE_DEVS, &mddev->flags); + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; - conf->failed_disks++; - clear_bit(In_sync, &rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery was running, make sure it aborts. */ @@ -711,7 +708,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) printk (KERN_ALERT "raid5: Disk failure on %s, disabling device." " Operation continuing on %d devices\n", - bdevname(rdev->bdev,b), conf->working_disks); + bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); } } @@ -1353,10 +1350,9 @@ static int page_is_zero(struct page *p) static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) { int sectors_per_chunk = conf->chunk_size >> 9; - sector_t x = stripe; int pd_idx, dd_idx; - int chunk_offset = sector_div(x, sectors_per_chunk); - stripe = x; + int chunk_offset = sector_div(stripe, sectors_per_chunk); + raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk + chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf); return pd_idx; @@ -2597,6 +2593,24 @@ static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk, return ret; } +static int raid5_congested(void *data, int bits) +{ + mddev_t *mddev = data; + raid5_conf_t *conf = mddev_to_conf(mddev); + + /* No difference between reads and writes. 
Just check + * how busy the stripe_cache is + */ + if (conf->inactive_blocked) + return 1; + if (conf->quiesce) + return 1; + if (list_empty_careful(&conf->inactive_list)) + return 1; + + return 0; +} + static int make_request(request_queue_t *q, struct bio * bi) { mddev_t *mddev = q->queuedata; @@ -2781,9 +2795,9 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes)==0); mddev->reshape_position = conf->expand_progress; - mddev->sb_dirty = 1; + set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); - wait_event(mddev->sb_wait, mddev->sb_dirty == 0 || + wait_event(mddev->sb_wait, mddev->flags == 0 || kthread_should_stop()); spin_lock_irq(&conf->device_lock); conf->expand_lo = mddev->reshape_position; @@ -3074,6 +3088,7 @@ static int run(mddev_t *mddev) mdk_rdev_t *rdev; struct disk_info *disk; struct list_head *tmp; + int working_disks = 0; if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) { printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", @@ -3176,14 +3191,14 @@ static int run(mddev_t *mddev) printk(KERN_INFO "raid5: device %s operational as raid" " disk %d\n", bdevname(rdev->bdev,b), raid_disk); - conf->working_disks++; + working_disks++; } } /* * 0 for a fully functional array, 1 or 2 for a degraded array. */ - mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks; + mddev->degraded = conf->raid_disks - working_disks; conf->mddev = mddev; conf->chunk_size = mddev->chunk_size; conf->level = mddev->level; @@ -3218,7 +3233,7 @@ static int run(mddev_t *mddev) if (mddev->degraded > conf->max_degraded) { printk(KERN_ERR "raid5: not enough operational devices for %s" " (%d/%d failed)\n", - mdname(mddev), conf->failed_disks, conf->raid_disks); + mdname(mddev), mddev->degraded, conf->raid_disks); goto abort; } @@ -3299,6 +3314,9 @@ static int run(mddev_t *mddev) mddev->queue->unplug_fn = raid5_unplug_device; mddev->queue->issue_flush_fn = raid5_issue_flush; + mddev->queue->backing_dev_info.congested_fn = raid5_congested; + mddev->queue->backing_dev_info.congested_data = mddev; + mddev->array_size = mddev->size * (conf->previous_raid_disks - conf->max_degraded); @@ -3375,7 +3393,7 @@ static void status (struct seq_file *seq, mddev_t *mddev) int i; seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); - seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks); + seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); for (i = 0; i < conf->raid_disks; i++) seq_printf (seq, "%s", conf->disks[i].rdev && @@ -3397,8 +3415,8 @@ static void print_raid5_conf (raid5_conf_t *conf) printk("(conf==NULL)\n"); return; } - printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks, - conf->working_disks, conf->failed_disks); + printk(" --- rd:%d wd:%d\n", conf->raid_disks, + conf->raid_disks - conf->mddev->degraded); for (i = 0; i < conf->raid_disks; i++) { char b[BDEVNAME_SIZE]; @@ -3420,11 +3438,11 @@ static int raid5_spare_active(mddev_t *mddev) tmp = conf->disks + i; if (tmp->rdev && !test_bit(Faulty, &tmp->rdev->flags) - && !test_bit(In_sync, &tmp->rdev->flags)) { + && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded--; - conf->failed_disks--; - conf->working_disks++; - set_bit(In_sync, &tmp->rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); } } 
print_raid5_conf(conf); @@ -3560,6 +3578,7 @@ static int raid5_start_reshape(mddev_t *mddev) struct list_head *rtmp; int spares = 0; int added_devices = 0; + unsigned long flags; if (mddev->degraded || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) @@ -3593,7 +3612,6 @@ static int raid5_start_reshape(mddev_t *mddev) if (raid5_add_disk(mddev, rdev)) { char nm[20]; set_bit(In_sync, &rdev->flags); - conf->working_disks++; added_devices++; rdev->recovery_offset = 0; sprintf(nm, "rd%d", rdev->raid_disk); @@ -3602,10 +3620,12 @@ static int raid5_start_reshape(mddev_t *mddev) break; } + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices; + spin_unlock_irqrestore(&conf->device_lock, flags); mddev->raid_disks = conf->raid_disks; mddev->reshape_position = 0; - mddev->sb_dirty = 1; + set_bit(MD_CHANGE_DEVS, &mddev->flags); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
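
Much of the md.c and raid*.c churn above replaces the old integer mddev->sb_dirty with a bitmask of change flags (MD_CHANGE_DEVS, MD_CHANGE_CLEAN, MD_CHANGE_PENDING) that md_update_sb() consumes. The sketch below is not part of the patch: it is a user-space illustration of that decision logic, with plain bit operations standing in for the kernel's atomic set_bit()/test_and_clear_bit(), and with illustrative bit numbers and struct names (fake_mddev, update_decides_nospares) that do not exist in the kernel.

/* Illustration only: mirrors the flag handling added to md_update_sb().
 * Plain bit ops are used so the sketch compiles and runs in user space. */
#include <stdio.h>

enum { MD_CHANGE_DEVS = 0, MD_CHANGE_CLEAN = 1, MD_CHANGE_PENDING = 2 };

struct fake_mddev {
	unsigned long flags;
	int degraded;
};

static int test_and_clear(unsigned long *flags, int bit)
{
	int was_set = (*flags >> bit) & 1;
	*flags &= ~(1UL << bit);
	return was_set;
}

/* Returns 1 if the superblock update may leave spare devices alone. */
static int update_decides_nospares(struct fake_mddev *mddev, int force_change)
{
	int nospares = 0;

	mddev->flags |= 1UL << MD_CHANGE_PENDING;	/* a write is now pending */
	if (test_and_clear(&mddev->flags, MD_CHANGE_DEVS))
		force_change = 1;			/* the device set changed */
	if (test_and_clear(&mddev->flags, MD_CHANGE_CLEAN))
		nospares = 1;				/* clean<->dirty transition only */
	if (force_change)
		nospares = 0;
	if (mddev->degraded)
		nospares = 0;				/* never skip spares when degraded */
	return nospares;
}

int main(void)
{
	struct fake_mddev m = { .flags = 1UL << MD_CHANGE_CLEAN, .degraded = 0 };

	printf("nospares = %d\n", update_decides_nospares(&m, 0)); /* prints 1 */
	return 0;
}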
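
Several hunks above also wire a congested_fn into each personality's backing_dev_info (linear_congested, multipath_congested, raid0_congested, raid1_congested, raid10_congested, raid5_congested) so writeback can ask whether the array would block. The following is a hypothetical user-space analogue of that pattern, ORing the congestion state of every member device as linear_congested() and raid0_congested() do; member, fake_array and the BDI_* constants here are stand-ins, not kernel APIs.

/* Illustration of the congested_fn pattern; array_congested() stands in
 * for the per-personality callback, and the bit test stands in for
 * bdi_congested() on each member device's request queue. */
#include <stdio.h>

#define BDI_WRITE_CONGESTED (1 << 0)
#define BDI_READ_CONGESTED  (1 << 1)

struct member { int congested_bits; };

struct fake_array {
	struct member *members;
	int nr_members;
};

/* The array is reported congested as soon as any member device reports
 * the requested congestion bits; the loop exits early once that happens. */
static int array_congested(struct fake_array *a, int bits)
{
	int i, ret = 0;

	for (i = 0; i < a->nr_members && !ret; i++)
		ret |= a->members[i].congested_bits & bits;
	return ret;
}

int main(void)
{
	struct member disks[3] = { {0}, {BDI_WRITE_CONGESTED}, {0} };
	struct fake_array a = { disks, 3 };

	printf("write congested: %d\n",
	       !!array_congested(&a, BDI_WRITE_CONGESTED)); /* prints 1 */
	return 0;
}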