96 files changed, 6646 insertions, 2308 deletions
diff --git a/drivers/base/iommu.c b/drivers/base/iommu.c
index c2d1eed9037..9f0e672f4be 100644
--- a/drivers/base/iommu.c
+++ b/drivers/base/iommu.c
@@ -98,3 +98,10 @@ phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain,
 	return iommu_ops->iova_to_phys(domain, iova);
 }
 EXPORT_SYMBOL_GPL(iommu_iova_to_phys);
+
+int iommu_domain_has_cap(struct iommu_domain *domain,
+			 unsigned long cap)
+{
+	return iommu_ops->domain_has_cap(domain, cap);
+}
+EXPORT_SYMBOL_GPL(iommu_domain_has_cap);
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 45c5a33daf4..31693bc2444 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -4,6 +4,7 @@
  * Filesystem request handling methods
  */
 
+#include <linux/ata.h>
 #include <linux/hdreg.h>
 #include <linux/blkdev.h>
 #include <linux/skbuff.h>
@@ -267,7 +268,7 @@ aoecmd_ata_rw(struct aoedev *d)
 		writebit = 0;
 	}
 
-	ah->cmdstat = WIN_READ | writebit | extbit;
+	ah->cmdstat = ATA_CMD_PIO_READ | writebit | extbit;
 
 	/* mark all tracking fields and load out */
 	buf->nframesout += 1;
@@ -362,10 +363,10 @@ resend(struct aoedev *d, struct aoetgt *t, struct frame *f)
 	switch (ah->cmdstat) {
 	default:
 		break;
-	case WIN_READ:
-	case WIN_READ_EXT:
-	case WIN_WRITE:
-	case WIN_WRITE_EXT:
+	case ATA_CMD_PIO_READ:
+	case ATA_CMD_PIO_READ_EXT:
+	case ATA_CMD_PIO_WRITE:
+	case ATA_CMD_PIO_WRITE_EXT:
 		put_lba(ah, f->lba);
 
 		n = f->bcnt;
@@ -812,8 +813,8 @@ aoecmd_ata_rsp(struct sk_buff *skb)
 			d->htgt = NULL;
 		n = ahout->scnt << 9;
 		switch (ahout->cmdstat) {
-		case WIN_READ:
-		case WIN_READ_EXT:
+		case ATA_CMD_PIO_READ:
+		case ATA_CMD_PIO_READ_EXT:
 			if (skb->len - sizeof *hin - sizeof *ahin < n) {
 				printk(KERN_ERR
 					"aoe: %s.  skb->len=%d need=%ld\n",
@@ -823,8 +824,8 @@ aoecmd_ata_rsp(struct sk_buff *skb)
 				return;
 			}
 			memcpy(f->bufaddr, ahin+1, n);
-		case WIN_WRITE:
-		case WIN_WRITE_EXT:
+		case ATA_CMD_PIO_WRITE:
+		case ATA_CMD_PIO_WRITE_EXT:
 			ifp = getif(t, skb->dev);
 			if (ifp) {
 				ifp->lost = 0;
@@ -838,7 +839,7 @@ aoecmd_ata_rsp(struct sk_buff *skb)
 				goto xmit;
 			}
 			break;
-		case WIN_IDENTIFY:
+		case ATA_CMD_ID_ATA:
 			if (skb->len - sizeof *hin - sizeof *ahin < 512) {
 				printk(KERN_INFO
 					"aoe: runt data size in ataid.  skb->len=%d\n",
@@ -914,7 +915,7 @@ aoecmd_ata_id(struct aoedev *d)
 
 	/* set up ata header */
 	ah->scnt = 1;
-	ah->cmdstat = WIN_IDENTIFY;
+	ah->cmdstat = ATA_CMD_ID_ATA;
 	ah->lba3 = 0xa0;
 
 	skb->dev = t->ifp->nd;
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index 482c0c4b964..3c11f062a18 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -42,6 +42,8 @@
 #include <linux/ata.h>
 #include <linux/hdreg.h>
 
+#define HD_IRQ 14
+
 #define REALLY_SLOW_IO
 #include <asm/system.h>
 #include <asm/io.h>
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 119be3442f2..6cccdc3f522 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -89,6 +89,7 @@
 #include <linux/delay.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
+#include <linux/ata.h>
 #include <linux/hdreg.h>
 #include <linux/platform_device.h>
 #if defined(CONFIG_OF)
@@ -208,7 +209,7 @@ struct ace_device {
 	struct gendisk *gd;
 
 	/* Inserted CF card parameters */
-	struct hd_driveid cf_id;
+	u16 cf_id[ATA_ID_WORDS];
 };
 
 static int ace_major;
@@ -402,21 +403,14 @@ static void ace_dump_regs(struct ace_device *ace)
 		 ace_in32(ace, ACE_CFGLBA), ace_in(ace, ACE_FATSTAT));
 }
 
-void ace_fix_driveid(struct hd_driveid *id)
+void ace_fix_driveid(u16 *id)
 {
 #if defined(__BIG_ENDIAN)
-	u16 *buf = (void *)id;
 	int i;
 
 	/* All half words have wrong byte order; swap the bytes */
-	for (i = 0; i < sizeof(struct hd_driveid); i += 2, buf++)
-		*buf = le16_to_cpu(*buf);
-
-	/* Some of the data values are 32bit; swap the half words  */
-	id->lba_capacity = ((id->lba_capacity >> 16) & 0x0000FFFF) |
-	    ((id->lba_capacity << 16) & 0xFFFF0000);
-	id->spg = ((id->spg >> 16) & 0x0000FFFF) |
-	    ((id->spg << 16) & 0xFFFF0000);
+	for (i = 0; i < ATA_ID_WORDS; i++, id++)
+		*id = le16_to_cpu(*id);
 #endif
 }
 
@@ -614,7 +608,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
 		break;
 
 	case ACE_FSM_STATE_IDENTIFY_COMPLETE:
-		ace_fix_driveid(&ace->cf_id);
+		ace_fix_driveid(&ace->cf_id[0]);
 		ace_dump_mem(&ace->cf_id, 512);	/* Debug: Dump out disk ID */
 
 		if (ace->data_result) {
@@ -627,9 +621,10 @@ static void ace_fsm_dostate(struct ace_device *ace)
 			ace->media_change = 0;
 
 			/* Record disk parameters */
-			set_capacity(ace->gd, ace->cf_id.lba_capacity);
+			set_capacity(ace->gd,
+				ata_id_u32(&ace->cf_id, ATA_ID_LBA_CAPACITY));
 			dev_info(ace->dev, "capacity: %i sectors\n",
-				 ace->cf_id.lba_capacity);
+				ata_id_u32(&ace->cf_id, ATA_ID_LBA_CAPACITY));
 		}
 
 		/* We're done, drop to IDLE state and notify waiters */
@@ -928,12 +923,13 @@ static int ace_release(struct gendisk *disk, fmode_t mode)
 static int ace_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
 	struct ace_device *ace = bdev->bd_disk->private_data;
+	u16 *cf_id = &ace->cf_id[0];
 
 	dev_dbg(ace->dev, "ace_getgeo()\n");
 
-	geo->heads = ace->cf_id.heads;
-	geo->sectors = ace->cf_id.sectors;
-	geo->cylinders = ace->cf_id.cyls;
+	geo->heads	= cf_id[ATA_ID_HEADS];
+	geo->sectors	= cf_id[ATA_ID_SECTORS];
+	geo->cylinders	= cf_id[ATA_ID_CYLS];
 
 	return 0;
 }
diff --git a/drivers/char/hw_random/timeriomem-rng.c b/drivers/char/hw_random/timeriomem-rng.c
index 10ad41be589..dcd352ad0e7 100644
--- a/drivers/char/hw_random/timeriomem-rng.c
+++ b/drivers/char/hw_random/timeriomem-rng.c
@@ -90,10 +90,30 @@ static struct hwrng timeriomem_rng_ops = {
 
 static int __init timeriomem_rng_probe(struct platform_device *pdev)
 {
+	struct resource *res, *mem;
 	int ret;
 
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+
+	if (!res)
+		return -ENOENT;
+
+	mem = request_mem_region(res->start, res->end - res->start + 1,
+				 pdev->name);
+	if (mem == NULL)
+		return -EBUSY;
+
+	dev_set_drvdata(&pdev->dev, mem);
+
 	timeriomem_rng_data = pdev->dev.platform_data;
 
+	timeriomem_rng_data->address = ioremap(res->start,
+						res->end - res->start + 1);
+	if (!timeriomem_rng_data->address) {
+		ret = -ENOMEM;
+		goto err_ioremap;
+	}
+
 	if (timeriomem_rng_data->period != 0
 		&& usecs_to_jiffies(timeriomem_rng_data->period) > 0) {
 		timeriomem_rng_timer.expires = jiffies;
@@ -104,23 +124,34 @@ static int __init timeriomem_rng_probe(struct platform_device *pdev)
 	timeriomem_rng_data->present = 1;
 
 	ret = hwrng_register(&timeriomem_rng_ops);
-	if (ret) {
-		dev_err(&pdev->dev, "problem registering\n");
-		return ret;
-	}
+	if (ret)
+		goto err_register;
 
 	dev_info(&pdev->dev, "32bits from 0x%p @ %dus\n",
 			timeriomem_rng_data->address,
 			timeriomem_rng_data->period);
 
 	return 0;
+
+err_register:
+	dev_err(&pdev->dev, "problem registering\n");
+	iounmap(timeriomem_rng_data->address);
+err_ioremap:
+	release_resource(mem);
+
+	return ret;
 }
 
 static int __devexit timeriomem_rng_remove(struct platform_device *pdev)
 {
+	struct resource *mem = dev_get_drvdata(&pdev->dev);
+
 	del_timer_sync(&timeriomem_rng_timer);
 	hwrng_unregister(&timeriomem_rng_ops);
 
+	iounmap(timeriomem_rng_data->address);
+	release_resource(mem);
+
 	return 0;
 }
 
diff --git a/drivers/crypto/ixp4xx_crypto.c b/drivers/crypto/ixp4xx_crypto.c
index d9e751be8c5..af9761ccf9f 100644
--- a/drivers/crypto/ixp4xx_crypto.c
+++ b/drivers/crypto/ixp4xx_crypto.c
@@ -101,6 +101,7 @@ struct buffer_desc {
 	u32 phys_addr;
 	u32 __reserved[4];
 	struct buffer_desc *next;
+	enum dma_data_direction dir;
 };
 
 struct crypt_ctl {
@@ -132,14 +133,10 @@ struct crypt_ctl {
 struct ablk_ctx {
 	struct buffer_desc *src;
 	struct buffer_desc *dst;
-	unsigned src_nents;
-	unsigned dst_nents;
 };
 
 struct aead_ctx {
 	struct buffer_desc *buffer;
-	unsigned short assoc_nents;
-	unsigned short src_nents;
 	struct scatterlist ivlist;
 	/* used when the hmac is not on one sg entry */
 	u8 *hmac_virt;
@@ -312,7 +309,7 @@ static struct crypt_ctl *get_crypt_desc_emerg(void)
 	}
 }
 
-static void free_buf_chain(struct buffer_desc *buf, u32 phys)
+static void free_buf_chain(struct device *dev, struct buffer_desc *buf,u32 phys)
 {
 	while (buf) {
 		struct buffer_desc *buf1;
@@ -320,6 +317,7 @@ static void free_buf_chain(struct buffer_desc *buf, u32 phys)
 
 		buf1 = buf->next;
 		phys1 = buf->phys_next;
+		dma_unmap_single(dev, buf->phys_next, buf->buf_len, buf->dir);
 		dma_pool_free(buffer_pool, buf, phys);
 		buf = buf1;
 		phys = phys1;
@@ -348,7 +346,6 @@ static void one_packet(dma_addr_t phys)
 	struct crypt_ctl *crypt;
 	struct ixp_ctx *ctx;
 	int failed;
-	enum dma_data_direction src_direction = DMA_BIDIRECTIONAL;
 
 	failed = phys & 0x1 ? -EBADMSG : 0;
 	phys &= ~0x3;
@@ -358,13 +355,8 @@ static void one_packet(dma_addr_t phys)
 	case CTL_FLAG_PERFORM_AEAD: {
 		struct aead_request *req = crypt->data.aead_req;
 		struct aead_ctx *req_ctx = aead_request_ctx(req);
-		dma_unmap_sg(dev, req->assoc, req_ctx->assoc_nents,
-				DMA_TO_DEVICE);
-		dma_unmap_sg(dev, &req_ctx->ivlist, 1, DMA_BIDIRECTIONAL);
-		dma_unmap_sg(dev, req->src, req_ctx->src_nents,
-				DMA_BIDIRECTIONAL);
 
-		free_buf_chain(req_ctx->buffer, crypt->src_buf);
+		free_buf_chain(dev, req_ctx->buffer, crypt->src_buf);
 		if (req_ctx->hmac_virt) {
 			finish_scattered_hmac(crypt);
 		}
@@ -374,16 +366,11 @@ static void one_packet(dma_addr_t phys)
 	case CTL_FLAG_PERFORM_ABLK: {
 		struct ablkcipher_request *req = crypt->data.ablk_req;
 		struct ablk_ctx *req_ctx = ablkcipher_request_ctx(req);
-		int nents;
+
 		if (req_ctx->dst) {
-			nents = req_ctx->dst_nents;
-			dma_unmap_sg(dev, req->dst, nents, DMA_FROM_DEVICE);
-			free_buf_chain(req_ctx->dst, crypt->dst_buf);
-			src_direction = DMA_TO_DEVICE;
+			free_buf_chain(dev, req_ctx->dst, crypt->dst_buf);
 		}
-		nents = req_ctx->src_nents;
-		dma_unmap_sg(dev, req->src, nents, src_direction);
-		free_buf_chain(req_ctx->src, crypt->src_buf);
+		free_buf_chain(dev, req_ctx->src, crypt->src_buf);
 		req->base.complete(&req->base, failed);
 		break;
 	}
@@ -750,56 +737,35 @@ static int setup_cipher(struct crypto_tfm *tfm, int encrypt,
 	return 0;
 }
 
-static int count_sg(struct scatterlist *sg, int nbytes)
+static struct buffer_desc *chainup_buffers(struct device *dev,
+		struct scatterlist *sg,	unsigned nbytes,
+		struct buffer_desc *buf, gfp_t flags,
+		enum dma_data_direction dir)
 {
-	int i;
-	for (i = 0; nbytes > 0; i++, sg = sg_next(sg))
-		nbytes -= sg->length;
-	return i;
-}
-
-static struct buffer_desc *chainup_buffers(struct scatterlist *sg,
-			unsigned nbytes, struct buffer_desc *buf, gfp_t flags)
-{
-	int nents = 0;
-
-	while (nbytes > 0) {
+	for (;nbytes > 0; sg = scatterwalk_sg_next(sg)) {
+		unsigned len = min(nbytes, sg->length);
 		struct buffer_desc *next_buf;
 		u32 next_buf_phys;
-		unsigned len = min(nbytes, sg_dma_len(sg));
+		void *ptr;
 
-		nents++;
 		nbytes -= len;
-		if (!buf->phys_addr) {
-			buf->phys_addr = sg_dma_address(sg);
-			buf->buf_len = len;
-			buf->next = NULL;
-			buf->phys_next = 0;
-			goto next;
-		}
-		/* Two consecutive chunks on one page may be handled by the old
-		 * buffer descriptor, increased by the length of the new one
-		 */
-		if (sg_dma_address(sg) == buf->phys_addr + buf->buf_len) {
-			buf->buf_len += len;
-			goto next;
-		}
+		ptr = page_address(sg_page(sg)) + sg->offset;
 		next_buf = dma_pool_alloc(buffer_pool, flags, &next_buf_phys);
-		if (!next_buf)
-			return NULL;
+		if (!next_buf) {
+			buf = NULL;
+			break;
+		}
+		sg_dma_address(sg) = dma_map_single(dev, ptr, len, dir);
 		buf->next = next_buf;
 		buf->phys_next = next_buf_phys;
-
 		buf = next_buf;
-		buf->next = NULL;
-		buf->phys_next = 0;
+
 		buf->phys_addr = sg_dma_address(sg);
 		buf->buf_len = len;
-next:
-		if (nbytes > 0) {
-			sg = sg_next(sg);
-		}
+		buf->dir = dir;
 	}
+	buf->next = NULL;
+	buf->phys_next = 0;
 	return buf;
 }
 
@@ -860,12 +826,12 @@ static int ablk_perform(struct ablkcipher_request *req, int encrypt)
 	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
 	struct ixp_ctx *ctx = crypto_ablkcipher_ctx(tfm);
 	unsigned ivsize = crypto_ablkcipher_ivsize(tfm);
-	int ret = -ENOMEM;
 	struct ix_sa_dir *dir;
 	struct crypt_ctl *crypt;
-	unsigned int nbytes = req->nbytes, nents;
+	unsigned int nbytes = req->nbytes;
 	enum dma_data_direction src_direction = DMA_BIDIRECTIONAL;
 	struct ablk_ctx *req_ctx = ablkcipher_request_ctx(req);
+	struct buffer_desc src_hook;
 	gfp_t flags = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ?
 				GFP_KERNEL : GFP_ATOMIC;
 
@@ -878,7 +844,7 @@ static int ablk_perform(struct ablkcipher_request *req, int encrypt)
 
 	crypt = get_crypt_desc();
 	if (!crypt)
-		return ret;
+		return -ENOMEM;
 
 	crypt->data.ablk_req = req;
 	crypt->crypto_ctx = dir->npe_ctx_phys;
@@ -891,53 +857,41 @@ static int ablk_perform(struct ablkcipher_request *req, int encrypt)
 	BUG_ON(ivsize && !req->info);
 	memcpy(crypt->iv, req->info, ivsize);
 	if (req->src != req->dst) {
+		struct buffer_desc dst_hook;
 		crypt->mode |= NPE_OP_NOT_IN_PLACE;
-		nents = count_sg(req->dst, nbytes);
 		/* This was never tested by Intel
 		 * for more than one dst buffer, I think. */
-		BUG_ON(nents != 1);
-		req_ctx->dst_nents = nents;
-		dma_map_sg(dev, req->dst, nents, DMA_FROM_DEVICE);
-		req_ctx->dst = dma_pool_alloc(buffer_pool, flags,&crypt->dst_buf);
-		if (!req_ctx->dst)
-			goto unmap_sg_dest;
-		req_ctx->dst->phys_addr = 0;
-		if (!chainup_buffers(req->dst, nbytes, req_ctx->dst, flags))
+		BUG_ON(req->dst->length < nbytes);
+		req_ctx->dst = NULL;
+		if (!chainup_buffers(dev, req->dst, nbytes, &dst_hook,
+					flags, DMA_FROM_DEVICE))
 			goto free_buf_dest;
 		src_direction = DMA_TO_DEVICE;
+		req_ctx->dst = dst_hook.next;
+		crypt->dst_buf = dst_hook.phys_next;
 	} else {
 		req_ctx->dst = NULL;
-		req_ctx->dst_nents = 0;
 	}
-	nents = count_sg(req->src, nbytes);
-	req_ctx->src_nents = nents;
-	dma_map_sg(dev, req->src, nents, src_direction);
-
-	req_ctx->src = dma_pool_alloc(buffer_pool, flags, &crypt->src_buf);
-	if (!req_ctx->src)
-		goto unmap_sg_src;
-	req_ctx->src->phys_addr = 0;
-	if (!chainup_buffers(req->src, nbytes, req_ctx->src, flags))
+	req_ctx->src = NULL;
+	if (!chainup_buffers(dev, req->src, nbytes, &src_hook,
+				flags, src_direction))
 		goto free_buf_src;
 
+	req_ctx->src = src_hook.next;
+	crypt->src_buf = src_hook.phys_next;
 	crypt->ctl_flags |= CTL_FLAG_PERFORM_ABLK;
 	qmgr_put_entry(SEND_QID, crypt_virt2phys(crypt));
 	BUG_ON(qmgr_stat_overflow(SEND_QID));
 	return -EINPROGRESS;
 
 free_buf_src:
-	free_buf_chain(req_ctx->src, crypt->src_buf);
-unmap_sg_src:
-	dma_unmap_sg(dev, req->src, req_ctx->src_nents, src_direction);
+	free_buf_chain(dev, req_ctx->src, crypt->src_buf);
 free_buf_dest:
 	if (req->src != req->dst) {
-		free_buf_chain(req_ctx->dst, crypt->dst_buf);
-unmap_sg_dest:
-		dma_unmap_sg(dev, req->src, req_ctx->dst_nents,
-			DMA_FROM_DEVICE);
+		free_buf_chain(dev, req_ctx->dst, crypt->dst_buf);
 	}
 	crypt->ctl_flags = CTL_FLAG_UNUSED;
-	return ret;
+	return -ENOMEM;
 }
 
 static int ablk_encrypt(struct ablkcipher_request *req)
@@ -985,7 +939,7 @@ static int hmac_inconsistent(struct scatterlist *sg, unsigned start,
 			break;
 
 		offset += sg->length;
-		sg = sg_next(sg);
+		sg = scatterwalk_sg_next(sg);
 	}
 	return (start + nbytes > offset + sg->length);
 }
@@ -997,11 +951,10 @@ static int aead_perform(struct aead_request *req, int encrypt,
 	struct ixp_ctx *ctx = crypto_aead_ctx(tfm);
 	unsigned ivsize = crypto_aead_ivsize(tfm);
 	unsigned authsize = crypto_aead_authsize(tfm);
-	int ret = -ENOMEM;
 	struct ix_sa_dir *dir;
 	struct crypt_ctl *crypt;
-	unsigned int cryptlen, nents;
-	struct buffer_desc *buf;
+	unsigned int cryptlen;
+	struct buffer_desc *buf, src_hook;
 	struct aead_ctx *req_ctx = aead_request_ctx(req);
 	gfp_t flags = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ?
 				GFP_KERNEL : GFP_ATOMIC;
@@ -1022,7 +975,7 @@ static int aead_perform(struct aead_request *req, int encrypt,
 	}
 	crypt = get_crypt_desc();
 	if (!crypt)
-		return ret;
+		return -ENOMEM;
 
 	crypt->data.aead_req = req;
 	crypt->crypto_ctx = dir->npe_ctx_phys;
@@ -1041,31 +994,27 @@ static int aead_perform(struct aead_request *req, int encrypt,
 		BUG(); /* -ENOTSUP because of my lazyness */
 	}
 
-	req_ctx->buffer = dma_pool_alloc(buffer_pool, flags, &crypt->src_buf);
-	if (!req_ctx->buffer)
-		goto out;
-	req_ctx->buffer->phys_addr = 0;
 	/* ASSOC data */
-	nents = count_sg(req->assoc, req->assoclen);
-	req_ctx->assoc_nents = nents;
-	dma_map_sg(dev, req->assoc, nents, DMA_TO_DEVICE);
-	buf = chainup_buffers(req->assoc, req->assoclen, req_ctx->buffer,flags);
+	buf = chainup_buffers(dev, req->assoc, req->assoclen, &src_hook,
+		flags, DMA_TO_DEVICE);
+	req_ctx->buffer = src_hook.next;
+	crypt->src_buf = src_hook.phys_next;
 	if (!buf)
-		goto unmap_sg_assoc;
+		goto out;
 	/* IV */
 	sg_init_table(&req_ctx->ivlist, 1);
 	sg_set_buf(&req_ctx->ivlist, iv, ivsize);
-	dma_map_sg(dev, &req_ctx->ivlist, 1, DMA_BIDIRECTIONAL);
-	buf = chainup_buffers(&req_ctx->ivlist, ivsize, buf, flags);
+	buf = chainup_buffers(dev, &req_ctx->ivlist, ivsize, buf, flags,
+			DMA_BIDIRECTIONAL);
 	if (!buf)
-		goto unmap_sg_iv;
+		goto free_chain;
 	if (unlikely(hmac_inconsistent(req->src, cryptlen, authsize))) {
 		/* The 12 hmac bytes are scattered,
 		 * we need to copy them into a safe buffer */
 		req_ctx->hmac_virt = dma_pool_alloc(buffer_pool, flags,
 				&crypt->icv_rev_aes);
 		if (unlikely(!req_ctx->hmac_virt))
-			goto unmap_sg_iv;
+			goto free_chain;
 		if (!encrypt) {
 			scatterwalk_map_and_copy(req_ctx->hmac_virt,
 				req->src, cryptlen, authsize, 0);
@@ -1075,33 +1024,28 @@ static int aead_perform(struct aead_request *req, int encrypt,
 		req_ctx->hmac_virt = NULL;
 	}
 	/* Crypt */
-	nents = count_sg(req->src, cryptlen + authsize);
-	req_ctx->src_nents = nents;
-	dma_map_sg(dev, req->src, nents, DMA_BIDIRECTIONAL);
-	buf = chainup_buffers(req->src, cryptlen + authsize, buf, flags);
+	buf = chainup_buffers(dev, req->src, cryptlen + authsize, buf, flags,
+			DMA_BIDIRECTIONAL);
 	if (!buf)
-		goto unmap_sg_src;
+		goto free_hmac_virt;
 	if (!req_ctx->hmac_virt) {
 		crypt->icv_rev_aes = buf->phys_addr + buf->buf_len - authsize;
 	}
+
 	crypt->ctl_flags |= CTL_FLAG_PERFORM_AEAD;
 	qmgr_put_entry(SEND_QID, crypt_virt2phys(crypt));
 	BUG_ON(qmgr_stat_overflow(SEND_QID));
 	return -EINPROGRESS;
-unmap_sg_src:
-	dma_unmap_sg(dev, req->src, req_ctx->src_nents, DMA_BIDIRECTIONAL);
+free_hmac_virt:
 	if (req_ctx->hmac_virt) {
 		dma_pool_free(buffer_pool, req_ctx->hmac_virt,
 				crypt->icv_rev_aes);
 	}
-unmap_sg_iv:
-	dma_unmap_sg(dev, &req_ctx->ivlist, 1, DMA_BIDIRECTIONAL);
-unmap_sg_assoc:
-	dma_unmap_sg(dev, req->assoc, req_ctx->assoc_nents, DMA_TO_DEVICE);
-	free_buf_chain(req_ctx->buffer, crypt->src_buf);
+free_chain:
+	free_buf_chain(dev, req_ctx->buffer, crypt->src_buf);
 out:
 	crypt->ctl_flags = CTL_FLAG_UNUSED;
-	return ret;
+	return -ENOMEM;
 }
 
 static int aead_setup(struct crypto_aead *tfm, unsigned int authsize)
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 48ea59e7967..3b3c01b6f1e 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -98,6 +98,17 @@ config NET_DMA
 	  Say Y here if you enabled INTEL_IOATDMA or FSL_DMA, otherwise
 	  say N.
 
+config ASYNC_TX_DMA
+	bool "Async_tx: Offload support for the async_tx api"
+	depends on DMA_ENGINE
+	help
+	  This allows the async_tx api to take advantage of offload engines for
+	  memcpy, memset, xor, and raid6 p+q operations.  If your platform has
+	  a dma engine that can perform raid operations and you have enabled
+	  MD_RAID456 say Y.
+
+	  If unsure, say N.
+
 config DMATEST
 	tristate "DMA Test client"
 	depends on DMA_ENGINE
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 280a9d263eb..92438e9dacc 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -507,6 +507,7 @@ struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, v
 			 * published in the general-purpose allocator
 			 */
 			dma_cap_set(DMA_PRIVATE, device->cap_mask);
+			device->privatecnt++;
 			err = dma_chan_get(chan);
 
 			if (err == -ENODEV) {
@@ -518,6 +519,8 @@ struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, v
 				       dma_chan_name(chan), err);
 			else
 				break;
+			if (--device->privatecnt == 0)
+				dma_cap_clear(DMA_PRIVATE, device->cap_mask);
 			chan->private = NULL;
 			chan = NULL;
 		}
@@ -537,6 +540,9 @@ void dma_release_channel(struct dma_chan *chan)
 	WARN_ONCE(chan->client_count != 1,
 		  "chan reference count %d != 1\n", chan->client_count);
 	dma_chan_put(chan);
+	/* drop PRIVATE cap enabled by __dma_request_channel() */
+	if (--chan->device->privatecnt == 0)
+		dma_cap_clear(DMA_PRIVATE, chan->device->cap_mask);
 	chan->private = NULL;
 	mutex_unlock(&dma_list_mutex);
 }
@@ -602,6 +608,24 @@ void dmaengine_put(void)
 }
 EXPORT_SYMBOL(dmaengine_put);
 
+static int get_dma_id(struct dma_device *device)
+{
+	int rc;
+
+ idr_retry:
+	if (!idr_pre_get(&dma_idr, GFP_KERNEL))
+		return -ENOMEM;
+	mutex_lock(&dma_list_mutex);
+	rc = idr_get_new(&dma_idr, NULL, &device->dev_id);
+	mutex_unlock(&dma_list_mutex);
+	if (rc == -EAGAIN)
+		goto idr_retry;
+	else if (rc != 0)
+		return rc;
+
+	return 0;
+}
+
 /**
  * dma_async_device_register - registers DMA devices found
  * @device: &dma_device
@@ -640,27 +664,25 @@ int dma_async_device_register(struct dma_device *device)
 	idr_ref = kmalloc(sizeof(*idr_ref), GFP_KERNEL);
 	if (!idr_ref)
 		return -ENOMEM;
-	atomic_set(idr_ref, 0);
- idr_retry:
-	if (!idr_pre_get(&dma_idr, GFP_KERNEL))
-		return -ENOMEM;
-	mutex_lock(&dma_list_mutex);
-	rc = idr_get_new(&dma_idr, NULL, &device->dev_id);
-	mutex_unlock(&dma_list_mutex);
-	if (rc == -EAGAIN)
-		goto idr_retry;
-	else if (rc != 0)
+	rc = get_dma_id(device);
+	if (rc != 0) {
+		kfree(idr_ref);
 		return rc;
+	}
+
+	atomic_set(idr_ref, 0);
 
 	/* represent channels in sysfs. Probably want devs too */
 	list_for_each_entry(chan, &device->channels, device_node) {
+		rc = -ENOMEM;
 		chan->local = alloc_percpu(typeof(*chan->local));
 		if (chan->local == NULL)
-			continue;
+			goto err_out;
 		chan->dev = kzalloc(sizeof(*chan->dev), GFP_KERNEL);
 		if (chan->dev == NULL) {
 			free_percpu(chan->local);
-			continue;
+			chan->local = NULL;
+			goto err_out;
 		}
 
 		chan->chan_id = chancnt++;
@@ -677,6 +699,8 @@ int dma_async_device_register(struct dma_device *device)
 		if (rc) {
 			free_percpu(chan->local);
 			chan->local = NULL;
+			kfree(chan->dev);
+			atomic_dec(idr_ref);
 			goto err_out;
 		}
 		chan->client_count = 0;
@@ -701,12 +725,23 @@ int dma_async_device_register(struct dma_device *device)
 			}
 		}
 	list_add_tail_rcu(&device->global_node, &dma_device_list);
+	if (dma_has_cap(DMA_PRIVATE, device->cap_mask))
+		device->privatecnt++;	/* Always private */
 	dma_channel_rebalance();
 	mutex_unlock(&dma_list_mutex);
 
 	return 0;
 
 err_out:
+	/* if we never registered a channel just release the idr */
+	if (atomic_read(idr_ref) == 0) {
+		mutex_lock(&dma_list_mutex);
+		idr_remove(&dma_idr, device->dev_id);
+		mutex_unlock(&dma_list_mutex);
+		kfree(idr_ref);
+		return rc;
+	}
+
 	list_for_each_entry(chan, &device->channels, device_node) {
 		if (chan->local == NULL)
 			continue;
@@ -893,6 +928,7 @@ void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx,
 {
 	tx->chan = chan;
 	spin_lock_init(&tx->lock);
+	INIT_LIST_HEAD(&tx->tx_list);
 }
 EXPORT_SYMBOL(dma_async_tx_descriptor_init);
 
diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c
index e190d8b3070..a27c0fb1bc1 100644
--- a/drivers/dma/dmatest.c
+++ b/drivers/dma/dmatest.c
@@ -38,6 +38,11 @@ module_param(max_channels, uint, S_IRUGO);
 MODULE_PARM_DESC(max_channels,
 		"Maximum number of channels to use (default: all)");
 
+static unsigned int xor_sources = 3;
+module_param(xor_sources, uint, S_IRUGO);
+MODULE_PARM_DESC(xor_sources,
+		"Number of xor source buffers (default: 3)");
+
 /*
  * Initialization patterns. All bytes in the source buffer has bit 7
  * set, all bytes in the destination buffer has bit 7 cleared.
@@ -59,8 +64,9 @@ struct dmatest_thread {
 	struct list_head	node;
 	struct task_struct	*task;
 	struct dma_chan		*chan;
-	u8			*srcbuf;
-	u8			*dstbuf;
+	u8			**srcs;
+	u8			**dsts;
+	enum dma_transaction_type type;
 };
 
 struct dmatest_chan {
@@ -98,30 +104,37 @@ static unsigned long dmatest_random(void)
 	return buf;
 }
 
-static void dmatest_init_srcbuf(u8 *buf, unsigned int start, unsigned int len)
+static void dmatest_init_srcs(u8 **bufs, unsigned int start, unsigned int len)
 {
 	unsigned int i;
-
-	for (i = 0; i < start; i++)
-		buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK);
-	for ( ; i < start + len; i++)
-		buf[i] = PATTERN_SRC | PATTERN_COPY
-			| (~i & PATTERN_COUNT_MASK);;
-	for ( ; i < test_buf_size; i++)
-		buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK);
+	u8 *buf;
+
+	for (; (buf = *bufs); bufs++) {
+		for (i = 0; i < start; i++)
+			buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK);
+		for ( ; i < start + len; i++)
+			buf[i] = PATTERN_SRC | PATTERN_COPY
+				| (~i & PATTERN_COUNT_MASK);;
+		for ( ; i < test_buf_size; i++)
+			buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK);
+		buf++;
+	}
 }
 
-static void dmatest_init_dstbuf(u8 *buf, unsigned int start, unsigned int len)
+static void dmatest_init_dsts(u8 **bufs, unsigned int start, unsigned int len)
 {
 	unsigned int i;
-
-	for (i = 0; i < start; i++)
-		buf[i] = PATTERN_DST | (~i & PATTERN_COUNT_MASK);
-	for ( ; i < start + len; i++)
-		buf[i] = PATTERN_DST | PATTERN_OVERWRITE
-			| (~i & PATTERN_COUNT_MASK);
-	for ( ; i < test_buf_size; i++)
-		buf[i] = PATTERN_DST | (~i & PATTERN_COUNT_MASK);
+	u8 *buf;
+
+	for (; (buf = *bufs); bufs++) {
+		for (i = 0; i < start; i++)
+			buf[i] = PATTERN_DST | (~i & PATTERN_COUNT_MASK);
+		for ( ; i < start + len; i++)
+			buf[i] = PATTERN_DST | PATTERN_OVERWRITE
+				| (~i & PATTERN_COUNT_MASK);
+		for ( ; i < test_buf_size; i++)
+			buf[i] = PATTERN_DST | (~i & PATTERN_COUNT_MASK);
+	}
 }
 
 static void dmatest_mismatch(u8 actual, u8 pattern, unsigned int index,
@@ -150,23 +163,30 @@ static void dmatest_mismatch(u8 actual, u8 pattern, unsigned int index,
 				thread_name, index, expected, actual);
 }
 
-static unsigned int dmatest_verify(u8 *buf, unsigned int start,
+static unsigned int dmatest_verify(u8 **bufs, unsigned int start,
 		unsigned int end, unsigned int counter, u8 pattern,
 		bool is_srcbuf)
 {
 	unsigned int i;
 	unsigned int error_count = 0;
 	u8 actual;
-
-	for (i = start; i < end; i++) {
-		actual = buf[i];
-		if (actual != (pattern | (~counter & PATTERN_COUNT_MASK))) {
-			if (error_count < 32)
-				dmatest_mismatch(actual, pattern, i, counter,
-						is_srcbuf);
-			error_count++;
+	u8 expected;
+	u8 *buf;
+	unsigned int counter_orig = counter;
+
+	for (; (buf = *bufs); bufs++) {
+		counter = counter_orig;
+		for (i = start; i < end; i++) {
+			actual = buf[i];
+			expected = pattern | (~counter & PATTERN_COUNT_MASK);
+			if (actual != expected) {
+				if (error_count < 32)
+					dmatest_mismatch(actual, pattern, i,
+							 counter, is_srcbuf);
+				error_count++;
+			}
+			counter++;
 		}
-		counter++;
 	}
 
 	if (error_count > 32)
@@ -176,12 +196,17 @@ static unsigned int dmatest_verify(u8 *buf, unsigned int start,
 	return error_count;
 }
 
+static void dmatest_callback(void *completion)
+{
+	complete(completion);
+}
+
 /*
  * This function repeatedly tests DMA transfers of various lengths and
- * offsets until it is told to exit by kthread_stop(). There may be
- * multiple threads running this function in parallel for a single
- * channel, and there may be multiple channels being tested in
- * parallel.
+ * offsets for a given operation type until it is told to exit by
+ * kthread_stop(). There may be multiple threads running this function
+ * in parallel for a single channel, and there may be multiple channels
+ * being tested in parallel.
  *
  * Before each test, the source and destination buffer is initialized
  * with a known pattern. This pattern is different depending on
@@ -201,25 +226,57 @@ static int dmatest_func(void *data)
 	unsigned int		total_tests = 0;
 	dma_cookie_t		cookie;
 	enum dma_status		status;
+	enum dma_ctrl_flags 	flags;
 	int			ret;
+	int			src_cnt;
+	int			dst_cnt;
+	int			i;
 
 	thread_name = current->comm;
 
 	ret = -ENOMEM;
-	thread->srcbuf = kmalloc(test_buf_size, GFP_KERNEL);
-	if (!thread->srcbuf)
-		goto err_srcbuf;
-	thread->dstbuf = kmalloc(test_buf_size, GFP_KERNEL);
-	if (!thread->dstbuf)
-		goto err_dstbuf;
 
 	smp_rmb();
 	chan = thread->chan;
+	if (thread->type == DMA_MEMCPY)
+		src_cnt = dst_cnt = 1;
+	else if (thread->type == DMA_XOR) {
+		src_cnt = xor_sources | 1; /* force odd to ensure dst = src */
+		dst_cnt = 1;
+	} else
+		goto err_srcs;
+
+	thread->srcs = kcalloc(src_cnt+1, sizeof(u8 *), GFP_KERNEL);
+	if (!thread->srcs)
+		goto err_srcs;
+	for (i = 0; i < src_cnt; i++) {
+		thread->srcs[i] = kmalloc(test_buf_size, GFP_KERNEL);
+		if (!thread->srcs[i])
+			goto err_srcbuf;
+	}
+	thread->srcs[i] = NULL;
+
+	thread->dsts = kcalloc(dst_cnt+1, sizeof(u8 *), GFP_KERNEL);
+	if (!thread->dsts)
+		goto err_dsts;
+	for (i = 0; i < dst_cnt; i++) {
+		thread->dsts[i] = kmalloc(test_buf_size, GFP_KERNEL);
+		if (!thread->dsts[i])
+			goto err_dstbuf;
+	}
+	thread->dsts[i] = NULL;
+
+	set_user_nice(current, 10);
+
+	flags = DMA_CTRL_ACK | DMA_COMPL_SKIP_DEST_UNMAP | DMA_PREP_INTERRUPT;
 
 	while (!kthread_should_stop()) {
 		struct dma_device *dev = chan->device;
-		struct dma_async_tx_descriptor *tx;
-		dma_addr_t dma_src, dma_dest;
+		struct dma_async_tx_descriptor *tx = NULL;
+		dma_addr_t dma_srcs[src_cnt];
+		dma_addr_t dma_dsts[dst_cnt];
+		struct completion cmp;
+		unsigned long tmo = msecs_to_jiffies(3000);
 
 		total_tests++;
 
@@ -227,22 +284,41 @@ static int dmatest_func(void *data)
 		src_off = dmatest_random() % (test_buf_size - len + 1);
 		dst_off = dmatest_random() % (test_buf_size - len + 1);
 
-		dmatest_init_srcbuf(thread->srcbuf, src_off, len);
-		dmatest_init_dstbuf(thread->dstbuf, dst_off, len);
+		dmatest_init_srcs(thread->srcs, src_off, len);
+		dmatest_init_dsts(thread->dsts, dst_off, len);
 
-		dma_src = dma_map_single(dev->dev, thread->srcbuf + src_off,
-				len, DMA_TO_DEVICE);
+		for (i = 0; i < src_cnt; i++) {
+			u8 *buf = thread->srcs[i] + src_off;
+
+			dma_srcs[i] = dma_map_single(dev->dev, buf, len,
+						     DMA_TO_DEVICE);
+		}
 		/* map with DMA_BIDIRECTIONAL to force writeback/invalidate */
-		dma_dest = dma_map_single(dev->dev, thread->dstbuf,
-				test_buf_size, DMA_BIDIRECTIONAL);
+		for (i = 0; i < dst_cnt; i++) {
+			dma_dsts[i] = dma_map_single(dev->dev, thread->dsts[i],
+						     test_buf_size,
+						     DMA_BIDIRECTIONAL);
+		}
+
+		if (thread->type == DMA_MEMCPY)
+			tx = dev->device_prep_dma_memcpy(chan,
+							 dma_dsts[0] + dst_off,
+							 dma_srcs[0], len,
+							 flags);
+		else if (thread->type == DMA_XOR)
+			tx = dev->device_prep_dma_xor(chan,
+						      dma_dsts[0] + dst_off,
+						      dma_srcs, xor_sources,
+						      len, flags);
 
-		tx = dev->device_prep_dma_memcpy(chan, dma_dest + dst_off,
-				dma_src, len,
-				DMA_CTRL_ACK | DMA_COMPL_SKIP_DEST_UNMAP);
 		if (!tx) {
-			dma_unmap_single(dev->dev, dma_src, len, DMA_TO_DEVICE);
-			dma_unmap_single(dev->dev, dma_dest,
-					test_buf_size, DMA_BIDIRECTIONAL);
+			for (i = 0; i < src_cnt; i++)
+				dma_unmap_single(dev->dev, dma_srcs[i], len,
+						 DMA_TO_DEVICE);
+			for (i = 0; i < dst_cnt; i++)
+				dma_unmap_single(dev->dev, dma_dsts[i],
+						 test_buf_size,
+						 DMA_BIDIRECTIONAL);
 			pr_warning("%s: #%u: prep error with src_off=0x%x "
 					"dst_off=0x%x len=0x%x\n",
 					thread_name, total_tests - 1,
@@ -251,7 +327,10 @@ static int dmatest_func(void *data)
 			failed_tests++;
 			continue;
 		}
-		tx->callback = NULL;
+
+		init_completion(&cmp);
+		tx->callback = dmatest_callback;
+		tx->callback_param = &cmp;
 		cookie = tx->tx_submit(tx);
 
 		if (dma_submit_error(cookie)) {
@@ -263,44 +342,50 @@ static int dmatest_func(void *data)
 			failed_tests++;
 			continue;
 		}
-		dma_async_memcpy_issue_pending(chan);
+		dma_async_issue_pending(chan);
 
-		do {
-			msleep(1);
-			status = dma_async_memcpy_complete(
-					chan, cookie, NULL, NULL);
-		} while (status == DMA_IN_PROGRESS);
+		tmo = wait_for_completion_timeout(&cmp, tmo);
+		status = dma_async_is_tx_complete(chan, cookie, NULL, NULL);
 
-		if (status == DMA_ERROR) {
-			pr_warning("%s: #%u: error during copy\n",
-					thread_name, total_tests - 1);
+		if (tmo == 0) {
+			pr_warning("%s: #%u: test timed out\n",
+				   thread_name, total_tests - 1);
+			failed_tests++;
+			continue;
+		} else if (status != DMA_SUCCESS) {
+			pr_warning("%s: #%u: got completion callback,"
+				   " but status is \'%s\'\n",
+				   thread_name, total_tests - 1,
+				   status == DMA_ERROR ? "error" : "in progress");
 			failed_tests++;
 			continue;
 		}
+
 		/* Unmap by myself (see DMA_COMPL_SKIP_DEST_UNMAP above) */
-		dma_unmap_single(dev->dev, dma_dest,
-				test_buf_size, DMA_BIDIRECTIONAL);
+		for (i = 0; i < dst_cnt; i++)
+			dma_unmap_single(dev->dev, dma_dsts[i], test_buf_size,
+					 DMA_BIDIRECTIONAL);
 
 		error_count = 0;
 
 		pr_debug("%s: verifying source buffer...\n", thread_name);
-		error_count += dmatest_verify(thread->srcbuf, 0, src_off,
+		error_count += dmatest_verify(thread->srcs, 0, src_off,
 				0, PATTERN_SRC, true);
-		error_count += dmatest_verify(thread->srcbuf, src_off,
+		error_count += dmatest_verify(thread->srcs, src_off,
 				src_off + len, src_off,
 				PATTERN_SRC | PATTERN_COPY, true);
-		error_count += dmatest_verify(thread->srcbuf, src_off + len,
+		error_count += dmatest_verify(thread->srcs, src_off + len,
 				test_buf_size, src_off + len,
 				PATTERN_SRC, true);
 
 		pr_debug("%s: verifying dest buffer...\n",
 				thread->task->comm);
-		error_count += dmatest_verify(thread->dstbuf, 0, dst_off,
+		error_count += dmatest_verify(thread->dsts, 0, dst_off,
 				0, PATTERN_DST, false);
-		error_count += dmatest_verify(thread->dstbuf, dst_off,
+		error_count += dmatest_verify(thread->dsts, dst_off,
 				dst_off + len, src_off,
 				PATTERN_SRC | PATTERN_COPY, false);
-		error_count += dmatest_verify(thread->dstbuf, dst_off + len,
+		error_count += dmatest_verify(thread->dsts, dst_off + len,
 				test_buf_size, dst_off + len,
 				PATTERN_DST, false);
 
@@ -319,10 +404,16 @@ static int dmatest_func(void *data)
 	}
 
 	ret = 0;
-	kfree(thread->dstbuf);
+	for (i = 0; thread->dsts[i]; i++)
+		kfree(thread->dsts[i]);
 err_dstbuf:
-	kfree(thread->srcbuf);
+	kfree(thread->dsts);
+err_dsts:
+	for (i = 0; thread->srcs[i]; i++)
+		kfree(thread->srcs[i]);
 err_srcbuf:
+	kfree(thread->srcs);
+err_srcs:
 	pr_notice("%s: terminating after %u tests, %u failures (status %d)\n",
 			thread_name, total_tests, failed_tests, ret);
 	return ret;
@@ -344,35 +435,36 @@ static void dmatest_cleanup_channel(struct dmatest_chan *dtc)
 	kfree(dtc);
 }
 
-static int dmatest_add_channel(struct dma_chan *chan)
+static int dmatest_add_threads(struct dmatest_chan *dtc, enum dma_transaction_type type)
 {
-	struct dmatest_chan	*dtc;
-	struct dmatest_thread	*thread;
-	unsigned int		i;
-
-	dtc = kmalloc(sizeof(struct dmatest_chan), GFP_KERNEL);
-	if (!dtc) {
-		pr_warning("dmatest: No memory for %s\n", dma_chan_name(chan));
-		return -ENOMEM;
-	}
+	struct dmatest_thread *thread;
+	struct dma_chan *chan = dtc->chan;
+	char *op;
+	unsigned int i;
 
-	dtc->chan = chan;
-	INIT_LIST_HEAD(&dtc->threads);
+	if (type == DMA_MEMCPY)
+		op = "copy";
+	else if (type == DMA_XOR)
+		op = "xor";
+	else
+		return -EINVAL;
 
 	for (i = 0; i < threads_per_chan; i++) {
 		thread = kzalloc(sizeof(struct dmatest_thread), GFP_KERNEL);
 		if (!thread) {
-			pr_warning("dmatest: No memory for %s-test%u\n",
-				   dma_chan_name(chan), i);
+			pr_warning("dmatest: No memory for %s-%s%u\n",
+				   dma_chan_name(chan), op, i);
+
 			break;
 		}
 		thread->chan = dtc->chan;
+		thread->type = type;
 		smp_wmb();
-		thread->task = kthread_run(dmatest_func, thread, "%s-test%u",
-				dma_chan_name(chan), i);
+		thread->task = kthread_run(dmatest_func, thread, "%s-%s%u",
+				dma_chan_name(chan), op, i);
 		if (IS_ERR(thread->task)) {
-			pr_warning("dmatest: Failed to run thread %s-test%u\n",
-					dma_chan_name(chan), i);
+			pr_warning("dmatest: Failed to run thread %s-%s%u\n",
+					dma_chan_name(chan), op, i);
 			kfree(thread);
 			break;
 		}
@@ -382,7 +474,36 @@ static int dmatest_add_channel(struct dma_chan *chan)
 		list_add_tail(&thread->node, &dtc->threads);
 	}
 
-	pr_info("dmatest: Started %u threads using %s\n", i, dma_chan_name(chan));
+	return i;
+}
+
+static int dmatest_add_channel(struct dma_chan *chan)
+{
+	struct dmatest_chan	*dtc;
+	struct dma_device	*dma_dev = chan->device;
+	unsigned int		thread_count = 0;
+	unsigned int		cnt;
+
+	dtc = kmalloc(sizeof(struct dmatest_chan), GFP_KERNEL);
+	if (!dtc) {
+		pr_warning("dmatest: No memory for %s\n", dma_chan_name(chan));
+		return -ENOMEM;
+	}
+
+	dtc->chan = chan;
+	INIT_LIST_HEAD(&dtc->threads);
+
+	if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask)) {
+		cnt = dmatest_add_threads(dtc, DMA_MEMCPY);
+		thread_count += cnt > 0 ?: 0;
+	}
+	if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) {
+		cnt = dmatest_add_threads(dtc, DMA_XOR);
+		thread_count += cnt > 0 ?: 0;
+	}
+
+	pr_info("dmatest: Started %u threads using %s\n",
+		thread_count, dma_chan_name(chan));
 
 	list_add_tail(&dtc->node, &dmatest_channels);
 	nr_channels++;
diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c
index 20ad3d26bec..98c9a847bf5 100644
--- a/drivers/dma/dw_dmac.c
+++ b/drivers/dma/dw_dmac.c
@@ -363,6 +363,82 @@ static void dwc_handle_error(struct dw_dma *dw, struct dw_dma_chan *dwc)
 	dwc_descriptor_complete(dwc, bad_desc);
 }
 
+/* --------------------- Cyclic DMA API extensions -------------------- */
+
+inline dma_addr_t dw_dma_get_src_addr(struct dma_chan *chan)
+{
+	struct dw_dma_chan *dwc = to_dw_dma_chan(chan);
+	return channel_readl(dwc, SAR);
+}
+EXPORT_SYMBOL(dw_dma_get_src_addr);
+
+inline dma_addr_t dw_dma_get_dst_addr(struct dma_chan *chan)
+{
+	struct dw_dma_chan *dwc = to_dw_dma_chan(chan);
+	return channel_readl(dwc, DAR);
+}
+EXPORT_SYMBOL(dw_dma_get_dst_addr);
+
+/* called with dwc->lock held and all DMAC interrupts disabled */
+static void dwc_handle_cyclic(struct dw_dma *dw, struct dw_dma_chan *dwc,
+		u32 status_block, u32 status_err, u32 status_xfer)
+{
+	if (status_block & dwc->mask) {
+		void (*callback)(void *param);
+		void *callback_param;
+
+		dev_vdbg(chan2dev(&dwc->chan), "new cyclic period llp 0x%08x\n",
+				channel_readl(dwc, LLP));
+		dma_writel(dw, CLEAR.BLOCK, dwc->mask);
+
+		callback = dwc->cdesc->period_callback;
+		callback_param = dwc->cdesc->period_callback_param;
+		if (callback) {
+			spin_unlock(&dwc->lock);
+			callback(callback_param);
+			spin_lock(&dwc->lock);
+		}
+	}
+
+	/*
+	 * Error and transfer complete are highly unlikely, and will most
+	 * likely be due to a configuration error by the user.
+	 */
+	if (unlikely(status_err & dwc->mask) ||
+			unlikely(status_xfer & dwc->mask)) {
+		int i;
+
+		dev_err(chan2dev(&dwc->chan), "cyclic DMA unexpected %s "
+				"interrupt, stopping DMA transfer\n",
+				status_xfer ? "xfer" : "error");
+		dev_err(chan2dev(&dwc->chan),
+			"  SAR: 0x%x DAR: 0x%x LLP: 0x%x CTL: 0x%x:%08x\n",
+			channel_readl(dwc, SAR),
+			channel_readl(dwc, DAR),
+			channel_readl(dwc, LLP),
+			channel_readl(dwc, CTL_HI),
+			channel_readl(dwc, CTL_LO));
+
+		channel_clear_bit(dw, CH_EN, dwc->mask);
+		while (dma_readl(dw, CH_EN) & dwc->mask)
+			cpu_relax();
+
+		/* make sure DMA does not restart by loading a new list */
+		channel_writel(dwc, LLP, 0);
+		channel_writel(dwc, CTL_LO, 0);
+		channel_writel(dwc, CTL_HI, 0);
+
+		dma_writel(dw, CLEAR.BLOCK, dwc->mask);
+		dma_writel(dw, CLEAR.ERROR, dwc->mask);
+		dma_writel(dw, CLEAR.XFER, dwc->mask);
+
+		for (i = 0; i < dwc->cdesc->periods; i++)
+			dwc_dump_lli(dwc, &dwc->cdesc->desc[i]->lli);
+	}
+}
+
+/* ------------------------------------------------------------------------- */
+
 static void dw_dma_tasklet(unsigned long data)
 {
 	struct dw_dma *dw = (struct dw_dma *)data;
@@ -382,7 +458,10 @@ static void dw_dma_tasklet(unsigned long data)
 	for (i = 0; i < dw->dma.chancnt; i++) {
 		dwc = &dw->chan[i];
 		spin_lock(&dwc->lock);
-		if (status_err & (1 << i))
+		if (test_bit(DW_DMA_IS_CYCLIC, &dwc->flags))
+			dwc_handle_cyclic(dw, dwc, status_block, status_err,
+					status_xfer);
+		else if (status_err & (1 << i))
 			dwc_handle_error(dw, dwc);
 		else if ((status_block | status_xfer) & (1 << i))
 			dwc_scan_descriptors(dw, dwc);
@@ -826,7 +905,6 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan)
 		dma_async_tx_descriptor_init(&desc->txd, chan);
 		desc->txd.tx_submit = dwc_tx_submit;
 		desc->txd.flags = DMA_CTRL_ACK;
-		INIT_LIST_HEAD(&desc->txd.tx_list);
 		desc->txd.phys = dma_map_single(chan2parent(chan), &desc->lli,
 				sizeof(desc->lli), DMA_TO_DEVICE);
 		dwc_desc_put(dwc, desc);
@@ -884,6 +962,257 @@ static void dwc_free_chan_resources(struct dma_chan *chan)
 	dev_vdbg(chan2dev(chan), "free_chan_resources done\n");
 }
 
+/* --------------------- Cyclic DMA API extensions -------------------- */
+
+/**
+ * dw_dma_cyclic_start - start the cyclic DMA transfer
+ * @chan: the DMA channel to start
+ *
+ * Must be called with soft interrupts disabled. Returns zero on success or
+ * -errno on failure.
+ */
+int dw_dma_cyclic_start(struct dma_chan *chan)
+{
+	struct dw_dma_chan	*dwc = to_dw_dma_chan(chan);
+	struct dw_dma		*dw = to_dw_dma(dwc->chan.device);
+
+	if (!test_bit(DW_DMA_IS_CYCLIC, &dwc->flags)) {
+		dev_err(chan2dev(&dwc->chan), "missing prep for cyclic DMA\n");
+		return -ENODEV;
+	}
+
+	spin_lock(&dwc->lock);
+
+	/* assert channel is idle */
+	if (dma_readl(dw, CH_EN) & dwc->mask) {
+		dev_err(chan2dev(&dwc->chan),
+			"BUG: Attempted to start non-idle channel\n");
+		dev_err(chan2dev(&dwc->chan),
+			"  SAR: 0x%x DAR: 0x%x LLP: 0x%x CTL: 0x%x:%08x\n",
+			channel_readl(dwc, SAR),
+			channel_readl(dwc, DAR),
+			channel_readl(dwc, LLP),
+			channel_readl(dwc, CTL_HI),
+			channel_readl(dwc, CTL_LO));
+		spin_unlock(&dwc->lock);
+		return -EBUSY;
+	}
+
+	dma_writel(dw, CLEAR.BLOCK, dwc->mask);
+	dma_writel(dw, CLEAR.ERROR, dwc->mask);
+	dma_writel(dw, CLEAR.XFER, dwc->mask);
+
+	/* setup DMAC channel registers */
+	channel_writel(dwc, LLP, dwc->cdesc->desc[0]->txd.phys);
+	channel_writel(dwc, CTL_LO, DWC_CTLL_LLP_D_EN | DWC_CTLL_LLP_S_EN);
+	channel_writel(dwc, CTL_HI, 0);
+
+	channel_set_bit(dw, CH_EN, dwc->mask);
+
+	spin_unlock(&dwc->lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(dw_dma_cyclic_start);
+
+/**
+ * dw_dma_cyclic_stop - stop the cyclic DMA transfer
+ * @chan: the DMA channel to stop
+ *
+ * Must be called with soft interrupts disabled.
+ */
+void dw_dma_cyclic_stop(struct dma_chan *chan)
+{
+	struct dw_dma_chan	*dwc = to_dw_dma_chan(chan);
+	struct dw_dma		*dw = to_dw_dma(dwc->chan.device);
+
+	spin_lock(&dwc->lock);
+
+	channel_clear_bit(dw, CH_EN, dwc->mask);
+	while (dma_readl(dw, CH_EN) & dwc->mask)
+		cpu_relax();
+
+	spin_unlock(&dwc->lock);
+}
+EXPORT_SYMBOL(dw_dma_cyclic_stop);
+
+/**
+ * dw_dma_cyclic_prep - prepare the cyclic DMA transfer
+ * @chan: the DMA channel to prepare
+ * @buf_addr: physical DMA address where the buffer starts
+ * @buf_len: total number of bytes for the entire buffer
+ * @period_len: number of bytes for each period
+ * @direction: transfer direction, to or from device
+ *
+ * Must be called before trying to start the transfer. Returns a valid struct
+ * dw_cyclic_desc if successful or an ERR_PTR(-errno) if not successful.
+ */
+struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan,
+		dma_addr_t buf_addr, size_t buf_len, size_t period_len,
+		enum dma_data_direction direction)
+{
+	struct dw_dma_chan		*dwc = to_dw_dma_chan(chan);
+	struct dw_cyclic_desc		*cdesc;
+	struct dw_cyclic_desc		*retval = NULL;
+	struct dw_desc			*desc;
+	struct dw_desc			*last = NULL;
+	struct dw_dma_slave		*dws = chan->private;
+	unsigned long			was_cyclic;
+	unsigned int			reg_width;
+	unsigned int			periods;
+	unsigned int			i;
+
+	spin_lock_bh(&dwc->lock);
+	if (!list_empty(&dwc->queue) || !list_empty(&dwc->active_list)) {
+		spin_unlock_bh(&dwc->lock);
+		dev_dbg(chan2dev(&dwc->chan),
+				"queue and/or active list are not empty\n");
+		return ERR_PTR(-EBUSY);
+	}
+
+	was_cyclic = test_and_set_bit(DW_DMA_IS_CYCLIC, &dwc->flags);
+	spin_unlock_bh(&dwc->lock);
+	if (was_cyclic) {
+		dev_dbg(chan2dev(&dwc->chan),
+				"channel already prepared for cyclic DMA\n");
+		return ERR_PTR(-EBUSY);
+	}
+
+	retval = ERR_PTR(-EINVAL);
+	reg_width = dws->reg_width;
+	periods = buf_len / period_len;
+
+	/* Check for too big/unaligned periods and unaligned DMA buffer. */
+	if (period_len > (DWC_MAX_COUNT << reg_width))
+		goto out_err;
+	if (unlikely(period_len & ((1 << reg_width) - 1)))
+		goto out_err;
+	if (unlikely(buf_addr & ((1 << reg_width) - 1)))
+		goto out_err;
+	if (unlikely(!(direction & (DMA_TO_DEVICE | DMA_FROM_DEVICE))))
+		goto out_err;
+
+	retval = ERR_PTR(-ENOMEM);
+
+	if (periods > NR_DESCS_PER_CHANNEL)
+		goto out_err;
+
+	cdesc = kzalloc(sizeof(struct dw_cyclic_desc), GFP_KERNEL);
+	if (!cdesc)
+		goto out_err;
+
+	cdesc->desc = kzalloc(sizeof(struct dw_desc *) * periods, GFP_KERNEL);
+	if (!cdesc->desc)
+		goto out_err_alloc;
+
+	for (i = 0; i < periods; i++) {
+		desc = dwc_desc_get(dwc);
+		if (!desc)
+			goto out_err_desc_get;
+
+		switch (direction) {
+		case DMA_TO_DEVICE:
+			desc->lli.dar = dws->tx_reg;
+			desc->lli.sar = buf_addr + (period_len * i);
+			desc->lli.ctllo = (DWC_DEFAULT_CTLLO
+					| DWC_CTLL_DST_WIDTH(reg_width)
+					| DWC_CTLL_SRC_WIDTH(reg_width)
+					| DWC_CTLL_DST_FIX
+					| DWC_CTLL_SRC_INC
+					| DWC_CTLL_FC_M2P
+					| DWC_CTLL_INT_EN);
+			break;
+		case DMA_FROM_DEVICE:
+			desc->lli.dar = buf_addr + (period_len * i);
+			desc->lli.sar = dws->rx_reg;
+			desc->lli.ctllo = (DWC_DEFAULT_CTLLO
+					| DWC_CTLL_SRC_WIDTH(reg_width)
+					| DWC_CTLL_DST_WIDTH(reg_width)
+					| DWC_CTLL_DST_INC
+					| DWC_CTLL_SRC_FIX
+					| DWC_CTLL_FC_P2M
+					| DWC_CTLL_INT_EN);
+			break;
+		default:
+			break;
+		}
+
+		desc->lli.ctlhi = (period_len >> reg_width);
+		cdesc->desc[i] = desc;
+
+		if (last) {
+			last->lli.llp = desc->txd.phys;
+			dma_sync_single_for_device(chan2parent(chan),
+					last->txd.phys, sizeof(last->lli),
+					DMA_TO_DEVICE);
+		}
+
+		last = desc;
+	}
+
+	/* lets make a cyclic list */
+	last->lli.llp = cdesc->desc[0]->txd.phys;
+	dma_sync_single_for_device(chan2parent(chan), last->txd.phys,
+			sizeof(last->lli), DMA_TO_DEVICE);
+
+	dev_dbg(chan2dev(&dwc->chan), "cyclic prepared buf 0x%08x len %zu "
+			"period %zu periods %d\n", buf_addr, buf_len,
+			period_len, periods);
+
+	cdesc->periods = periods;
+	dwc->cdesc = cdesc;
+
+	return cdesc;
+
+out_err_desc_get:
+	while (i--)
+		dwc_desc_put(dwc, cdesc->desc[i]);
+out_err_alloc:
+	kfree(cdesc);
+out_err:
+	clear_bit(DW_DMA_IS_CYCLIC, &dwc->flags);
+	return (struct dw_cyclic_desc *)retval;
+}
+EXPORT_SYMBOL(dw_dma_cyclic_prep);
+
+/**
+ * dw_dma_cyclic_free - free a prepared cyclic DMA transfer
+ * @chan: the DMA channel to free
+ */
+void dw_dma_cyclic_free(struct dma_chan *chan)
+{
+	struct dw_dma_chan	*dwc = to_dw_dma_chan(chan);
+	struct dw_dma		*dw = to_dw_dma(dwc->chan.device);
+	struct dw_cyclic_desc	*cdesc = dwc->cdesc;
+	int			i;
+
+	dev_dbg(chan2dev(&dwc->chan), "cyclic free\n");
+
+	if (!cdesc)
+		return;
+
+	spin_lock_bh(&dwc->lock);
+
+	channel_clear_bit(dw, CH_EN, dwc->mask);
+	while (dma_readl(dw, CH_EN) & dwc->mask)
+		cpu_relax();
+
+	dma_writel(dw, CLEAR.BLOCK, dwc->mask);
+	dma_writel(dw, CLEAR.ERROR, dwc->mask);
+	dma_writel(dw, CLEAR.XFER, dwc->mask);
+
+	spin_unlock_bh(&dwc->lock);
+
+	for (i = 0; i < cdesc->periods; i++)
+		dwc_desc_put(dwc, cdesc->desc[i]);
+
+	kfree(cdesc->desc);
+	kfree(cdesc);
+
+	clear_bit(DW_DMA_IS_CYCLIC, &dwc->flags);
+}
+EXPORT_SYMBOL(dw_dma_cyclic_free);
+
 /*----------------------------------------------------------------------*/
 
 static void dw_dma_off(struct dw_dma *dw)
diff --git a/drivers/dma/dw_dmac_regs.h b/drivers/dma/dw_dmac_regs.h
index b252b202c5c..13a58076703 100644
--- a/drivers/dma/dw_dmac_regs.h
+++ b/drivers/dma/dw_dmac_regs.h
@@ -126,6 +126,10 @@ struct dw_dma_regs {
 
 #define DW_REGLEN		0x400
 
+enum dw_dmac_flags {
+	DW_DMA_IS_CYCLIC = 0,
+};
+
 struct dw_dma_chan {
 	struct dma_chan		chan;
 	void __iomem		*ch_regs;
@@ -134,10 +138,12 @@ struct dw_dma_chan {
 	spinlock_t		lock;
 
 	/* these other elements are all protected by lock */
+	unsigned long		flags;
 	dma_cookie_t		completed;
 	struct list_head	active_list;
 	struct list_head	queue;
 	struct list_head	free_list;
+	struct dw_cyclic_desc	*cdesc;
 
 	unsigned int		descs_allocated;
 };
@@ -158,7 +164,6 @@ static inline struct dw_dma_chan *to_dw_dma_chan(struct dma_chan *chan)
 	return container_of(chan, struct dw_dma_chan, chan);
 }
 
-
 struct dw_dma {
 	struct dma_device	dma;
 	void __iomem		*regs;
diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index 86d6da47f55..da8a8ed9e41 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -354,7 +354,6 @@ static struct fsl_desc_sw *fsl_dma_alloc_descriptor(
 		dma_async_tx_descriptor_init(&desc_sw->async_tx,
 						&fsl_chan->common);
 		desc_sw->async_tx.tx_submit = fsl_dma_tx_submit;
-		INIT_LIST_HEAD(&desc_sw->async_tx.tx_list);
 		desc_sw->async_tx.phys = pdesc;
 	}
 
diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c
index 5905cd36bcd..e4fc33c1c32 100644
--- a/drivers/dma/ioat_dma.c
+++ b/drivers/dma/ioat_dma.c
@@ -693,7 +693,6 @@ static struct ioat_desc_sw *ioat_dma_alloc_descriptor(
 		desc_sw->async_tx.tx_submit = ioat2_tx_submit;
 		break;
 	}
-	INIT_LIST_HEAD(&desc_sw->async_tx.tx_list);
 
 	desc_sw->hw = desc;
 	desc_sw->async_tx.phys = phys;
diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c
index 16adbe61cfb..2f052265122 100644
--- a/drivers/dma/iop-adma.c
+++ b/drivers/dma/iop-adma.c
@@ -498,7 +498,6 @@ static int iop_adma_alloc_chan_resources(struct dma_chan *chan)
 		slot->async_tx.tx_submit = iop_adma_tx_submit;
 		INIT_LIST_HEAD(&slot->chain_node);
 		INIT_LIST_HEAD(&slot->slot_node);
-		INIT_LIST_HEAD(&slot->async_tx.tx_list);
 		hw_desc = (char *) iop_chan->device->dma_desc_pool;
 		slot->async_tx.phys =
 			(dma_addr_t) &hw_desc[idx * IOP_ADMA_SLOT_SIZE];
diff --git a/drivers/dma/ipu/ipu_idmac.c b/drivers/dma/ipu/ipu_idmac.c
index da781d10789..e202a6ce557 100644
--- a/drivers/dma/ipu/ipu_idmac.c
+++ b/drivers/dma/ipu/ipu_idmac.c
@@ -28,6 +28,9 @@
 #define FS_VF_IN_VALID	0x00000002
 #define FS_ENC_IN_VALID	0x00000001
 
+static int ipu_disable_channel(struct idmac *idmac, struct idmac_channel *ichan,
+			       bool wait_for_stop);
+
 /*
  * There can be only one, we could allocate it dynamically, but then we'd have
  * to add an extra parameter to some functions, and use something as ugly as
@@ -107,7 +110,7 @@ static uint32_t bytes_per_pixel(enum pixel_fmt fmt)
 	}
 }
 
-/* Enable / disable direct write to memory by the Camera Sensor Interface */
+/* Enable direct write to memory by the Camera Sensor Interface */
 static void ipu_ic_enable_task(struct ipu *ipu, enum ipu_channel channel)
 {
 	uint32_t ic_conf, mask;
@@ -126,6 +129,7 @@ static void ipu_ic_enable_task(struct ipu *ipu, enum ipu_channel channel)
 	idmac_write_icreg(ipu, ic_conf, IC_CONF);
 }
 
+/* Called under spin_lock_irqsave(&ipu_data.lock) */
 static void ipu_ic_disable_task(struct ipu *ipu, enum ipu_channel channel)
 {
 	uint32_t ic_conf, mask;
@@ -422,7 +426,7 @@ static void ipu_ch_param_set_size(union chan_param_mem *params,
 		break;
 	default:
 		dev_err(ipu_data.dev,
-			"mxc ipu: unimplemented pixel format %d\n", pixel_fmt);
+			"mx3 ipu: unimplemented pixel format %d\n", pixel_fmt);
 		break;
 	}
 
@@ -433,20 +437,20 @@ static void ipu_ch_param_set_burst_size(union chan_param_mem *params,
 					uint16_t burst_pixels)
 {
 	params->pp.npb = burst_pixels - 1;
-};
+}
 
 static void ipu_ch_param_set_buffer(union chan_param_mem *params,
 				    dma_addr_t buf0, dma_addr_t buf1)
 {
 	params->pp.eba0 = buf0;
 	params->pp.eba1 = buf1;
-};
+}
 
 static void ipu_ch_param_set_rotation(union chan_param_mem *params,
 				      enum ipu_rotate_mode rotate)
 {
 	params->pp.bam = rotate;
-};
+}
 
 static void ipu_write_param_mem(uint32_t addr, uint32_t *data,
 				uint32_t num_words)
@@ -571,7 +575,7 @@ static uint32_t dma_param_addr(uint32_t dma_ch)
 {
 	/* Channel Parameter Memory */
 	return 0x10000 | (dma_ch << 4);
-};
+}
 
 static void ipu_channel_set_priority(struct ipu *ipu, enum ipu_channel channel,
 				     bool prio)
@@ -611,7 +615,8 @@ static uint32_t ipu_channel_conf_mask(enum ipu_channel channel)
 
 /**
  * ipu_enable_channel() - enable an IPU channel.
- * @channel:	channel ID.
+ * @idmac:	IPU DMAC context.
+ * @ichan:	IDMAC channel.
  * @return:	0 on success or negative error code on failure.
  */
 static int ipu_enable_channel(struct idmac *idmac, struct idmac_channel *ichan)
@@ -649,7 +654,7 @@ static int ipu_enable_channel(struct idmac *idmac, struct idmac_channel *ichan)
 
 /**
  * ipu_init_channel_buffer() - initialize a buffer for logical IPU channel.
- * @channel:	channel ID.
+ * @ichan:	IDMAC channel.
  * @pixel_fmt:	pixel format of buffer. Pixel format is a FOURCC ASCII code.
  * @width:	width of buffer in pixels.
  * @height:	height of buffer in pixels.
@@ -687,7 +692,7 @@ static int ipu_init_channel_buffer(struct idmac_channel *ichan,
 	}
 
 	/* IC channel's stride must be a multiple of 8 pixels */
-	if ((channel <= 13) && (stride % 8)) {
+	if ((channel <= IDMAC_IC_13) && (stride % 8)) {
 		dev_err(ipu->dev, "Stride must be 8 pixel multiple\n");
 		return -EINVAL;
 	}
@@ -752,7 +757,7 @@ static void ipu_select_buffer(enum ipu_channel channel, int buffer_n)
 
 /**
  * ipu_update_channel_buffer() - update physical address of a channel buffer.
- * @channel:	channel ID.
+ * @ichan:	IDMAC channel.
  * @buffer_n:	buffer number to update.
  *		0 or 1 are the only valid values.
  * @phyaddr:	buffer physical address.
@@ -760,9 +765,10 @@ static void ipu_select_buffer(enum ipu_channel channel, int buffer_n)
  *              function will fail if the buffer is set to ready.
  */
 /* Called under spin_lock(_irqsave)(&ichan->lock) */
-static int ipu_update_channel_buffer(enum ipu_channel channel,
+static int ipu_update_channel_buffer(struct idmac_channel *ichan,
 				     int buffer_n, dma_addr_t phyaddr)
 {
+	enum ipu_channel channel = ichan->dma_chan.chan_id;
 	uint32_t reg;
 	unsigned long flags;
 
@@ -771,8 +777,8 @@ static int ipu_update_channel_buffer(enum ipu_channel channel,
 	if (buffer_n == 0) {
 		reg = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF0_RDY);
 		if (reg & (1UL << channel)) {
-			spin_unlock_irqrestore(&ipu_data.lock, flags);
-			return -EACCES;
+			ipu_ic_disable_task(&ipu_data, channel);
+			ichan->status = IPU_CHANNEL_READY;
 		}
 
 		/* 44.3.3.1.9 - Row Number 1 (WORD1, offset 0) */
@@ -782,8 +788,8 @@ static int ipu_update_channel_buffer(enum ipu_channel channel,
 	} else {
 		reg = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF1_RDY);
 		if (reg & (1UL << channel)) {
-			spin_unlock_irqrestore(&ipu_data.lock, flags);
-			return -EACCES;
+			ipu_ic_disable_task(&ipu_data, channel);
+			ichan->status = IPU_CHANNEL_READY;
 		}
 
 		/* Check if double-buffering is already enabled */
@@ -805,6 +811,39 @@ static int ipu_update_channel_buffer(enum ipu_channel channel,
 }
 
 /* Called under spin_lock_irqsave(&ichan->lock) */
+static int ipu_submit_buffer(struct idmac_channel *ichan,
+	struct idmac_tx_desc *desc, struct scatterlist *sg, int buf_idx)
+{
+	unsigned int chan_id = ichan->dma_chan.chan_id;
+	struct device *dev = &ichan->dma_chan.dev->device;
+	int ret;
+
+	if (async_tx_test_ack(&desc->txd))
+		return -EINTR;
+
+	/*
+	 * On first invocation this shouldn't be necessary, the call to
+	 * ipu_init_channel_buffer() above will set addresses for us, so we
+	 * could make it conditional on status >= IPU_CHANNEL_ENABLED, but
+	 * doing it again shouldn't hurt either.
+	 */
+	ret = ipu_update_channel_buffer(ichan, buf_idx,
+					sg_dma_address(sg));
+
+	if (ret < 0) {
+		dev_err(dev, "Updating sg %p on channel 0x%x buffer %d failed!\n",
+			sg, chan_id, buf_idx);
+		return ret;
+	}
+
+	ipu_select_buffer(chan_id, buf_idx);
+	dev_dbg(dev, "Updated sg %p on channel 0x%x buffer %d\n",
+		sg, chan_id, buf_idx);
+
+	return 0;
+}
+
+/* Called under spin_lock_irqsave(&ichan->lock) */
 static int ipu_submit_channel_buffers(struct idmac_channel *ichan,
 				      struct idmac_tx_desc *desc)
 {
@@ -815,20 +854,10 @@ static int ipu_submit_channel_buffers(struct idmac_channel *ichan,
 		if (!ichan->sg[i]) {
 			ichan->sg[i] = sg;
 
-			/*
-			 * On first invocation this shouldn't be necessary, the
-			 * call to ipu_init_channel_buffer() above will set
-			 * addresses for us, so we could make it conditional
-			 * on status >= IPU_CHANNEL_ENABLED, but doing it again
-			 * shouldn't hurt either.
-			 */
-			ret = ipu_update_channel_buffer(ichan->dma_chan.chan_id, i,
-							sg_dma_address(sg));
+			ret = ipu_submit_buffer(ichan, desc, sg, i);
 			if (ret < 0)
 				return ret;
 
-			ipu_select_buffer(ichan->dma_chan.chan_id, i);
-
 			sg = sg_next(sg);
 		}
 	}
@@ -842,19 +871,22 @@ static dma_cookie_t idmac_tx_submit(struct dma_async_tx_descriptor *tx)
 	struct idmac_channel *ichan = to_idmac_chan(tx->chan);
 	struct idmac *idmac = to_idmac(tx->chan->device);
 	struct ipu *ipu = to_ipu(idmac);
+	struct device *dev = &ichan->dma_chan.dev->device;
 	dma_cookie_t cookie;
 	unsigned long flags;
+	int ret;
 
 	/* Sanity check */
 	if (!list_empty(&desc->list)) {
 		/* The descriptor doesn't belong to client */
-		dev_err(&ichan->dma_chan.dev->device,
-			"Descriptor %p not prepared!\n", tx);
+		dev_err(dev, "Descriptor %p not prepared!\n", tx);
 		return -EBUSY;
 	}
 
 	mutex_lock(&ichan->chan_mutex);
 
+	async_tx_clear_ack(tx);
+
 	if (ichan->status < IPU_CHANNEL_READY) {
 		struct idmac_video_param *video = &ichan->params.video;
 		/*
@@ -878,16 +910,7 @@ static dma_cookie_t idmac_tx_submit(struct dma_async_tx_descriptor *tx)
 			goto out;
 	}
 
-	/* ipu->lock can be taken under ichan->lock, but not v.v. */
-	spin_lock_irqsave(&ichan->lock, flags);
-
-	/* submit_buffers() atomically verifies and fills empty sg slots */
-	cookie = ipu_submit_channel_buffers(ichan, desc);
-
-	spin_unlock_irqrestore(&ichan->lock, flags);
-
-	if (cookie < 0)
-		goto out;
+	dev_dbg(dev, "Submitting sg %p\n", &desc->sg[0]);
 
 	cookie = ichan->dma_chan.cookie;
 
@@ -897,24 +920,40 @@ static dma_cookie_t idmac_tx_submit(struct dma_async_tx_descriptor *tx)
 	/* from dmaengine.h: "last cookie value returned to client" */
 	ichan->dma_chan.cookie = cookie;
 	tx->cookie = cookie;
+
+	/* ipu->lock can be taken under ichan->lock, but not v.v. */
 	spin_lock_irqsave(&ichan->lock, flags);
+
 	list_add_tail(&desc->list, &ichan->queue);
+	/* submit_buffers() atomically verifies and fills empty sg slots */
+	ret = ipu_submit_channel_buffers(ichan, desc);
+
 	spin_unlock_irqrestore(&ichan->lock, flags);
 
+	if (ret < 0) {
+		cookie = ret;
+		goto dequeue;
+	}
+
 	if (ichan->status < IPU_CHANNEL_ENABLED) {
-		int ret = ipu_enable_channel(idmac, ichan);
+		ret = ipu_enable_channel(idmac, ichan);
 		if (ret < 0) {
 			cookie = ret;
-			spin_lock_irqsave(&ichan->lock, flags);
-			list_del_init(&desc->list);
-			spin_unlock_irqrestore(&ichan->lock, flags);
-			tx->cookie = cookie;
-			ichan->dma_chan.cookie = cookie;
+			goto dequeue;
 		}
 	}
 
 	dump_idmac_reg(ipu);
 
+dequeue:
+	if (cookie < 0) {
+		spin_lock_irqsave(&ichan->lock, flags);
+		list_del_init(&desc->list);
+		spin_unlock_irqrestore(&ichan->lock, flags);
+		tx->cookie = cookie;
+		ichan->dma_chan.cookie = cookie;
+	}
+
 out:
 	mutex_unlock(&ichan->chan_mutex);
 
@@ -944,8 +983,6 @@ static int idmac_desc_alloc(struct idmac_channel *ichan, int n)
 		memset(txd, 0, sizeof(*txd));
 		dma_async_tx_descriptor_init(txd, &ichan->dma_chan);
 		txd->tx_submit		= idmac_tx_submit;
-		txd->chan		= &ichan->dma_chan;
-		INIT_LIST_HEAD(&txd->tx_list);
 
 		list_add(&desc->list, &ichan->free_list);
 
@@ -1161,6 +1198,24 @@ static int ipu_disable_channel(struct idmac *idmac, struct idmac_channel *ichan,
 	return 0;
 }
 
+static struct scatterlist *idmac_sg_next(struct idmac_channel *ichan,
+	struct idmac_tx_desc **desc, struct scatterlist *sg)
+{
+	struct scatterlist *sgnew = sg ? sg_next(sg) : NULL;
+
+	if (sgnew)
+		/* next sg-element in this list */
+		return sgnew;
+
+	if ((*desc)->list.next == &ichan->queue)
+		/* No more descriptors on the queue */
+		return NULL;
+
+	/* Fetch next descriptor */
+	*desc = list_entry((*desc)->list.next, struct idmac_tx_desc, list);
+	return (*desc)->sg;
+}
+
 /*
  * We have several possibilities here:
  * current BUF		next BUF
@@ -1176,23 +1231,46 @@ static int ipu_disable_channel(struct idmac *idmac, struct idmac_channel *ichan,
 static irqreturn_t idmac_interrupt(int irq, void *dev_id)
 {
 	struct idmac_channel *ichan = dev_id;
+	struct device *dev = &ichan->dma_chan.dev->device;
 	unsigned int chan_id = ichan->dma_chan.chan_id;
 	struct scatterlist **sg, *sgnext, *sgnew = NULL;
 	/* Next transfer descriptor */
-	struct idmac_tx_desc *desc = NULL, *descnew;
+	struct idmac_tx_desc *desc, *descnew;
 	dma_async_tx_callback callback;
 	void *callback_param;
 	bool done = false;
-	u32	ready0 = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF0_RDY),
-		ready1 = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF1_RDY),
-		curbuf = idmac_read_ipureg(&ipu_data, IPU_CHA_CUR_BUF);
+	u32 ready0, ready1, curbuf, err;
+	unsigned long flags;
 
 	/* IDMAC has cleared the respective BUFx_RDY bit, we manage the buffer */
 
-	pr_debug("IDMAC irq %d\n", irq);
+	dev_dbg(dev, "IDMAC irq %d, buf %d\n", irq, ichan->active_buffer);
+
+	spin_lock_irqsave(&ipu_data.lock, flags);
+
+	ready0	= idmac_read_ipureg(&ipu_data, IPU_CHA_BUF0_RDY);
+	ready1	= idmac_read_ipureg(&ipu_data, IPU_CHA_BUF1_RDY);
+	curbuf	= idmac_read_ipureg(&ipu_data, IPU_CHA_CUR_BUF);
+	err	= idmac_read_ipureg(&ipu_data, IPU_INT_STAT_4);
+
+	if (err & (1 << chan_id)) {
+		idmac_write_ipureg(&ipu_data, 1 << chan_id, IPU_INT_STAT_4);
+		spin_unlock_irqrestore(&ipu_data.lock, flags);
+		/*
+		 * Doing this
+		 * ichan->sg[0] = ichan->sg[1] = NULL;
+		 * you can force channel re-enable on the next tx_submit(), but
+		 * this is dirty - think about descriptors with multiple
+		 * sg elements.
+		 */
+		dev_warn(dev, "NFB4EOF on channel %d, ready %x, %x, cur %x\n",
+			 chan_id, ready0, ready1, curbuf);
+		return IRQ_HANDLED;
+	}
+	spin_unlock_irqrestore(&ipu_data.lock, flags);
+
 	/* Other interrupts do not interfere with this channel */
 	spin_lock(&ichan->lock);
-
 	if (unlikely(chan_id != IDMAC_SDC_0 && chan_id != IDMAC_SDC_1 &&
 		     ((curbuf >> chan_id) & 1) == ichan->active_buffer)) {
 		int i = 100;
@@ -1207,19 +1285,23 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id)
 
 		if (!i) {
 			spin_unlock(&ichan->lock);
-			dev_dbg(ichan->dma_chan.device->dev,
+			dev_dbg(dev,
 				"IRQ on active buffer on channel %x, active "
 				"%d, ready %x, %x, current %x!\n", chan_id,
 				ichan->active_buffer, ready0, ready1, curbuf);
 			return IRQ_NONE;
-		}
+		} else
+			dev_dbg(dev,
+				"Buffer deactivated on channel %x, active "
+				"%d, ready %x, %x, current %x, rest %d!\n", chan_id,
+				ichan->active_buffer, ready0, ready1, curbuf, i);
 	}
 
 	if (unlikely((ichan->active_buffer && (ready1 >> chan_id) & 1) ||
 		     (!ichan->active_buffer && (ready0 >> chan_id) & 1)
 		     )) {
 		spin_unlock(&ichan->lock);
-		dev_dbg(ichan->dma_chan.device->dev,
+		dev_dbg(dev,
 			"IRQ with active buffer still ready on channel %x, "
 			"active %d, ready %x, %x!\n", chan_id,
 			ichan->active_buffer, ready0, ready1);
@@ -1227,8 +1309,9 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id)
 	}
 
 	if (unlikely(list_empty(&ichan->queue))) {
+		ichan->sg[ichan->active_buffer] = NULL;
 		spin_unlock(&ichan->lock);
-		dev_err(ichan->dma_chan.device->dev,
+		dev_err(dev,
 			"IRQ without queued buffers on channel %x, active %d, "
 			"ready %x, %x!\n", chan_id,
 			ichan->active_buffer, ready0, ready1);
@@ -1243,40 +1326,44 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id)
 	sg = &ichan->sg[ichan->active_buffer];
 	sgnext = ichan->sg[!ichan->active_buffer];
 
+	if (!*sg) {
+		spin_unlock(&ichan->lock);
+		return IRQ_HANDLED;
+	}
+
+	desc = list_entry(ichan->queue.next, struct idmac_tx_desc, list);
+	descnew = desc;
+
+	dev_dbg(dev, "IDMAC irq %d, dma 0x%08x, next dma 0x%08x, current %d, curbuf 0x%08x\n",
+		irq, sg_dma_address(*sg), sgnext ? sg_dma_address(sgnext) : 0, ichan->active_buffer, curbuf);
+
+	/* Find the descriptor of sgnext */
+	sgnew = idmac_sg_next(ichan, &descnew, *sg);
+	if (sgnext != sgnew)
+		dev_err(dev, "Submitted buffer %p, next buffer %p\n", sgnext, sgnew);
+
 	/*
 	 * if sgnext == NULL sg must be the last element in a scatterlist and
 	 * queue must be empty
 	 */
 	if (unlikely(!sgnext)) {
-		if (unlikely(sg_next(*sg))) {
-			dev_err(ichan->dma_chan.device->dev,
-				"Broken buffer-update locking on channel %x!\n",
-				chan_id);
-			/* We'll let the user catch up */
+		if (!WARN_ON(sg_next(*sg)))
+			dev_dbg(dev, "Underrun on channel %x\n", chan_id);
+		ichan->sg[!ichan->active_buffer] = sgnew;
+
+		if (unlikely(sgnew)) {
+			ipu_submit_buffer(ichan, descnew, sgnew, !ichan->active_buffer);
 		} else {
-			/* Underrun */
+			spin_lock_irqsave(&ipu_data.lock, flags);
 			ipu_ic_disable_task(&ipu_data, chan_id);
-			dev_dbg(ichan->dma_chan.device->dev,
-				"Underrun on channel %x\n", chan_id);
+			spin_unlock_irqrestore(&ipu_data.lock, flags);
 			ichan->status = IPU_CHANNEL_READY;
 			/* Continue to check for complete descriptor */
 		}
 	}
 
-	desc = list_entry(ichan->queue.next, struct idmac_tx_desc, list);
-
-	/* First calculate and submit the next sg element */
-	if (likely(sgnext))
-		sgnew = sg_next(sgnext);
-
-	if (unlikely(!sgnew)) {
-		/* Start a new scatterlist, if any queued */
-		if (likely(desc->list.next != &ichan->queue)) {
-			descnew = list_entry(desc->list.next,
-					     struct idmac_tx_desc, list);
-			sgnew = &descnew->sg[0];
-		}
-	}
+	/* Calculate and submit the next sg element */
+	sgnew = idmac_sg_next(ichan, &descnew, sgnew);
 
 	if (unlikely(!sg_next(*sg)) || !sgnext) {
 		/*
@@ -1289,17 +1376,13 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id)
 
 	*sg = sgnew;
 
-	if (likely(sgnew)) {
-		int ret;
-
-		ret = ipu_update_channel_buffer(chan_id, ichan->active_buffer,
-						sg_dma_address(*sg));
-		if (ret < 0)
-			dev_err(ichan->dma_chan.device->dev,
-				"Failed to update buffer on channel %x buffer %d!\n",
-				chan_id, ichan->active_buffer);
-		else
-			ipu_select_buffer(chan_id, ichan->active_buffer);
+	if (likely(sgnew) &&
+	    ipu_submit_buffer(ichan, descnew, sgnew, ichan->active_buffer) < 0) {
+		callback = desc->txd.callback;
+		callback_param = desc->txd.callback_param;
+		spin_unlock(&ichan->lock);
+		callback(callback_param);
+		spin_lock(&ichan->lock);
 	}
 
 	/* Flip the active buffer - even if update above failed */
@@ -1327,13 +1410,20 @@ static void ipu_gc_tasklet(unsigned long arg)
 		struct idmac_channel *ichan = ipu->channel + i;
 		struct idmac_tx_desc *desc;
 		unsigned long flags;
-		int j;
+		struct scatterlist *sg;
+		int j, k;
 
 		for (j = 0; j < ichan->n_tx_desc; j++) {
 			desc = ichan->desc + j;
 			spin_lock_irqsave(&ichan->lock, flags);
 			if (async_tx_test_ack(&desc->txd)) {
 				list_move(&desc->list, &ichan->free_list);
+				for_each_sg(desc->sg, sg, desc->sg_len, k) {
+					if (ichan->sg[0] == sg)
+						ichan->sg[0] = NULL;
+					else if (ichan->sg[1] == sg)
+						ichan->sg[1] = NULL;
+				}
 				async_tx_clear_ack(&desc->txd);
 			}
 			spin_unlock_irqrestore(&ichan->lock, flags);
@@ -1341,13 +1431,7 @@ static void ipu_gc_tasklet(unsigned long arg)
 	}
 }
 
-/*
- * At the time .device_alloc_chan_resources() method is called, we cannot know,
- * whether the client will accept the channel. Thus we must only check, if we
- * can satisfy client's request but the only real criterion to verify, whether
- * the client has accepted our offer is the client_count. That's why we have to
- * perform the rest of our allocation tasks on the first call to this function.
- */
+/* Allocate and initialise a transfer descriptor. */
 static struct dma_async_tx_descriptor *idmac_prep_slave_sg(struct dma_chan *chan,
 		struct scatterlist *sgl, unsigned int sg_len,
 		enum dma_data_direction direction, unsigned long tx_flags)
@@ -1358,8 +1442,8 @@ static struct dma_async_tx_descriptor *idmac_prep_slave_sg(struct dma_chan *chan
 	unsigned long flags;
 
 	/* We only can handle these three channels so far */
-	if (ichan->dma_chan.chan_id != IDMAC_SDC_0 && ichan->dma_chan.chan_id != IDMAC_SDC_1 &&
-	    ichan->dma_chan.chan_id != IDMAC_IC_7)
+	if (chan->chan_id != IDMAC_SDC_0 && chan->chan_id != IDMAC_SDC_1 &&
+	    chan->chan_id != IDMAC_IC_7)
 		return NULL;
 
 	if (direction != DMA_FROM_DEVICE && direction != DMA_TO_DEVICE) {
@@ -1400,7 +1484,7 @@ static void idmac_issue_pending(struct dma_chan *chan)
 
 	/* This is not always needed, but doesn't hurt either */
 	spin_lock_irqsave(&ipu->lock, flags);
-	ipu_select_buffer(ichan->dma_chan.chan_id, ichan->active_buffer);
+	ipu_select_buffer(chan->chan_id, ichan->active_buffer);
 	spin_unlock_irqrestore(&ipu->lock, flags);
 
 	/*
@@ -1432,8 +1516,7 @@ static void __idmac_terminate_all(struct dma_chan *chan)
 			struct idmac_tx_desc *desc = ichan->desc + i;
 			if (list_empty(&desc->list))
 				/* Descriptor was prepared, but not submitted */
-				list_add(&desc->list,
-					 &ichan->free_list);
+				list_add(&desc->list, &ichan->free_list);
 
 			async_tx_clear_ack(&desc->txd);
 		}
@@ -1458,6 +1541,28 @@ static void idmac_terminate_all(struct dma_chan *chan)
 	mutex_unlock(&ichan->chan_mutex);
 }
 
+#ifdef DEBUG
+static irqreturn_t ic_sof_irq(int irq, void *dev_id)
+{
+	struct idmac_channel *ichan = dev_id;
+	printk(KERN_DEBUG "Got SOF IRQ %d on Channel %d\n",
+	       irq, ichan->dma_chan.chan_id);
+	disable_irq(irq);
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t ic_eof_irq(int irq, void *dev_id)
+{
+	struct idmac_channel *ichan = dev_id;
+	printk(KERN_DEBUG "Got EOF IRQ %d on Channel %d\n",
+	       irq, ichan->dma_chan.chan_id);
+	disable_irq(irq);
+	return IRQ_HANDLED;
+}
+
+static int ic_sof = -EINVAL, ic_eof = -EINVAL;
+#endif
+
 static int idmac_alloc_chan_resources(struct dma_chan *chan)
 {
 	struct idmac_channel *ichan = to_idmac_chan(chan);
@@ -1471,31 +1576,49 @@ static int idmac_alloc_chan_resources(struct dma_chan *chan)
 	chan->cookie		= 1;
 	ichan->completed	= -ENXIO;
 
-	ret = ipu_irq_map(ichan->dma_chan.chan_id);
+	ret = ipu_irq_map(chan->chan_id);
 	if (ret < 0)
 		goto eimap;
 
 	ichan->eof_irq = ret;
+
+	/*
+	 * Important to first disable the channel, because maybe someone
+	 * used it before us, e.g., the bootloader
+	 */
+	ipu_disable_channel(idmac, ichan, true);
+
+	ret = ipu_init_channel(idmac, ichan);
+	if (ret < 0)
+		goto eichan;
+
 	ret = request_irq(ichan->eof_irq, idmac_interrupt, 0,
 			  ichan->eof_name, ichan);
 	if (ret < 0)
 		goto erirq;
 
-	ret = ipu_init_channel(idmac, ichan);
-	if (ret < 0)
-		goto eichan;
+#ifdef DEBUG
+	if (chan->chan_id == IDMAC_IC_7) {
+		ic_sof = ipu_irq_map(69);
+		if (ic_sof > 0)
+			request_irq(ic_sof, ic_sof_irq, 0, "IC SOF", ichan);
+		ic_eof = ipu_irq_map(70);
+		if (ic_eof > 0)
+			request_irq(ic_eof, ic_eof_irq, 0, "IC EOF", ichan);
+	}
+#endif
 
 	ichan->status = IPU_CHANNEL_INITIALIZED;
 
-	dev_dbg(&ichan->dma_chan.dev->device, "Found channel 0x%x, irq %d\n",
-		ichan->dma_chan.chan_id, ichan->eof_irq);
+	dev_dbg(&chan->dev->device, "Found channel 0x%x, irq %d\n",
+		chan->chan_id, ichan->eof_irq);
 
 	return ret;
 
-eichan:
-	free_irq(ichan->eof_irq, ichan);
 erirq:
-	ipu_irq_unmap(ichan->dma_chan.chan_id);
+	ipu_uninit_channel(idmac, ichan);
+eichan:
+	ipu_irq_unmap(chan->chan_id);
 eimap:
 	return ret;
 }
@@ -1510,8 +1633,22 @@ static void idmac_free_chan_resources(struct dma_chan *chan)
 	__idmac_terminate_all(chan);
 
 	if (ichan->status > IPU_CHANNEL_FREE) {
+#ifdef DEBUG
+		if (chan->chan_id == IDMAC_IC_7) {
+			if (ic_sof > 0) {
+				free_irq(ic_sof, ichan);
+				ipu_irq_unmap(69);
+				ic_sof = -EINVAL;
+			}
+			if (ic_eof > 0) {
+				free_irq(ic_eof, ichan);
+				ipu_irq_unmap(70);
+				ic_eof = -EINVAL;
+			}
+		}
+#endif
 		free_irq(ichan->eof_irq, ichan);
-		ipu_irq_unmap(ichan->dma_chan.chan_id);
+		ipu_irq_unmap(chan->chan_id);
 	}
 
 	ichan->status = IPU_CHANNEL_FREE;
@@ -1573,7 +1710,7 @@ static int __init ipu_idmac_init(struct ipu *ipu)
 		dma_chan->device	= &idmac->dma;
 		dma_chan->cookie	= 1;
 		dma_chan->chan_id	= i;
-		list_add_tail(&ichan->dma_chan.device_node, &dma->channels);
+		list_add_tail(&dma_chan->device_node, &dma->channels);
 	}
 
 	idmac_write_icreg(ipu, 0x00000070, IDMAC_CONF);
@@ -1581,7 +1718,7 @@ static int __init ipu_idmac_init(struct ipu *ipu)
 	return dma_async_device_register(&idmac->dma);
 }
 
-static void ipu_idmac_exit(struct ipu *ipu)
+static void __exit ipu_idmac_exit(struct ipu *ipu)
 {
 	int i;
 	struct idmac *idmac = &ipu->idmac;
@@ -1600,7 +1737,7 @@ static void ipu_idmac_exit(struct ipu *ipu)
  * IPU common probe / remove
  */
 
-static int ipu_probe(struct platform_device *pdev)
+static int __init ipu_probe(struct platform_device *pdev)
 {
 	struct ipu_platform_data *pdata = pdev->dev.platform_data;
 	struct resource *mem_ipu, *mem_ic;
@@ -1700,7 +1837,7 @@ err_noirq:
 	return ret;
 }
 
-static int ipu_remove(struct platform_device *pdev)
+static int __exit ipu_remove(struct platform_device *pdev)
 {
 	struct ipu *ipu = platform_get_drvdata(pdev);
 
@@ -1725,7 +1862,7 @@ static struct platform_driver ipu_platform_driver = {
 		.name	= "ipu-core",
 		.owner	= THIS_MODULE,
 	},
-	.remove		= ipu_remove,
+	.remove		= __exit_p(ipu_remove),
 };
 
 static int __init ipu_init(void)
diff --git a/drivers/dma/ipu/ipu_irq.c b/drivers/dma/ipu/ipu_irq.c
index 83f532cc767..dd8ebc75b66 100644
--- a/drivers/dma/ipu/ipu_irq.c
+++ b/drivers/dma/ipu/ipu_irq.c
@@ -352,7 +352,7 @@ static struct irq_chip ipu_irq_chip = {
 };
 
 /* Install the IRQ handler */
-int ipu_irq_attach_irq(struct ipu *ipu, struct platform_device *dev)
+int __init ipu_irq_attach_irq(struct ipu *ipu, struct platform_device *dev)
 {
 	struct ipu_platform_data *pdata = dev->dev.platform_data;
 	unsigned int irq, irq_base, i;
diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c
index cb7f26fb9f1..ddab94f5122 100644
--- a/drivers/dma/mv_xor.c
+++ b/drivers/dma/mv_xor.c
@@ -632,7 +632,6 @@ static int mv_xor_alloc_chan_resources(struct dma_chan *chan)
 		slot->async_tx.tx_submit = mv_xor_tx_submit;
 		INIT_LIST_HEAD(&slot->chain_node);
 		INIT_LIST_HEAD(&slot->slot_node);
-		INIT_LIST_HEAD(&slot->async_tx.tx_list);
 		hw_desc = (char *) mv_chan->device->dma_desc_pool;
 		slot->async_tx.phys =
 			(dma_addr_t) &hw_desc[idx * MV_XOR_SLOT_SIZE];
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 2281b5098e9..36e0675be9f 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -121,6 +121,7 @@ config MD_RAID10
 config MD_RAID456
 	tristate "RAID-4/RAID-5/RAID-6 mode"
 	depends on BLK_DEV_MD
+	select MD_RAID6_PQ
 	select ASYNC_MEMCPY
 	select ASYNC_XOR
 	---help---
@@ -151,34 +152,8 @@ config MD_RAID456
 
 	  If unsure, say Y.
 
-config MD_RAID5_RESHAPE
-	bool "Support adding drives to a raid-5 array"
-	depends on MD_RAID456
-	default y
-	---help---
-	  A RAID-5 set can be expanded by adding extra drives. This
-	  requires "restriping" the array which means (almost) every
-	  block must be written to a different place.
-
-          This option allows such restriping to be done while the array
-	  is online.
-
-	  You will need mdadm version 2.4.1 or later to use this
-	  feature safely.  During the early stage of reshape there is
-	  a critical section where live data is being over-written.  A
-	  crash during this time needs extra care for recovery.  The
-	  newer mdadm takes a copy of the data in the critical section
-	  and will restore it, if necessary, after a crash.
-
-	  The mdadm usage is e.g.
-	       mdadm --grow /dev/md1 --raid-disks=6
-	  to grow '/dev/md1' to having 6 disks.
-
-	  Note: The array can only be expanded, not contracted.
-	  There should be enough spares already present to make the new
-	  array workable.
-
-	  If unsure, say Y.
+config MD_RAID6_PQ
+	tristate
 
 config MD_MULTIPATH
 	tristate "Multipath I/O support"
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 72880b7e28d..45cc5951d92 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -2,20 +2,21 @@
 # Makefile for the kernel software RAID and LVM drivers.
 #
 
-dm-mod-objs	:= dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
+dm-mod-y	+= dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
 		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o
-dm-multipath-objs := dm-path-selector.o dm-mpath.o
-dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-snap-transient.o \
+dm-multipath-y	+= dm-path-selector.o dm-mpath.o
+dm-snapshot-y	+= dm-snap.o dm-exception-store.o dm-snap-transient.o \
 		    dm-snap-persistent.o
-dm-mirror-objs	:= dm-raid1.o
-md-mod-objs     := md.o bitmap.o
-raid456-objs	:= raid5.o raid6algos.o raid6recov.o raid6tables.o \
+dm-mirror-y	+= dm-raid1.o
+md-mod-y	+= md.o bitmap.o
+raid456-y	+= raid5.o
+raid6_pq-y	+= raid6algos.o raid6recov.o raid6tables.o \
 		   raid6int1.o raid6int2.o raid6int4.o \
 		   raid6int8.o raid6int16.o raid6int32.o \
 		   raid6altivec1.o raid6altivec2.o raid6altivec4.o \
 		   raid6altivec8.o \
 		   raid6mmx.o raid6sse1.o raid6sse2.o
-hostprogs-y	:= mktables
+hostprogs-y	+= mktables
 
 # Note: link order is important.  All raid personalities
 # and must come before md.o, as they each initialise 
@@ -26,6 +27,7 @@ obj-$(CONFIG_MD_LINEAR)		+= linear.o
 obj-$(CONFIG_MD_RAID0)		+= raid0.o
 obj-$(CONFIG_MD_RAID1)		+= raid1.o
 obj-$(CONFIG_MD_RAID10)		+= raid10.o
+obj-$(CONFIG_MD_RAID6_PQ)	+= raid6_pq.o
 obj-$(CONFIG_MD_RAID456)	+= raid456.o
 obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
 obj-$(CONFIG_MD_FAULTY)		+= faulty.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 71994376339..f8a9f7ab2cb 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -16,6 +16,7 @@
  * wait if count gets too high, wake when it drops to half.
  */
 
+#include <linux/blkdev.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
@@ -26,8 +27,8 @@
 #include <linux/file.h>
 #include <linux/mount.h>
 #include <linux/buffer_head.h>
-#include <linux/raid/md.h>
-#include <linux/raid/bitmap.h>
+#include "md.h"
+#include "bitmap.h"
 
 /* debug macros */
 
@@ -111,9 +112,10 @@ static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int creat
 	unsigned char *mappage;
 
 	if (page >= bitmap->pages) {
-		printk(KERN_ALERT
-			"%s: invalid bitmap page request: %lu (> %lu)\n",
-			bmname(bitmap), page, bitmap->pages-1);
+		/* This can happen if bitmap_start_sync goes beyond
+		 * End-of-device while looking for a whole page.
+		 * It is harmless.
+		 */
 		return -EINVAL;
 	}
 
@@ -265,7 +267,6 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
 	list_for_each_continue_rcu(pos, &mddev->disks) {
 		rdev = list_entry(pos, mdk_rdev_t, same_set);
 		if (rdev->raid_disk >= 0 &&
-		    test_bit(In_sync, &rdev->flags) &&
 		    !test_bit(Faulty, &rdev->flags)) {
 			/* this is a usable devices */
 			atomic_inc(&rdev->nr_pending);
@@ -297,7 +298,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 				    + size/512 > 0)
 					/* bitmap runs in to metadata */
 					goto bad_alignment;
-				if (rdev->data_offset + mddev->size*2
+				if (rdev->data_offset + mddev->dev_sectors
 				    > rdev->sb_start + bitmap->offset)
 					/* data runs in to bitmap */
 					goto bad_alignment;
@@ -570,7 +571,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
 		 le32_to_cpu(sb->version) > BITMAP_MAJOR_HI)
 		reason = "unrecognized superblock version";
-	else if (chunksize < PAGE_SIZE)
+	else if (chunksize < 512)
 		reason = "bitmap chunksize too small";
 	else if ((1 << ffz(~chunksize)) != chunksize)
 		reason = "bitmap chunksize not a power of 2";
@@ -1306,6 +1307,9 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 		PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",
 		  atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
 	}
+	if (bitmap->mddev->degraded)
+		/* Never clear bits or update events_cleared when degraded */
+		success = 0;
 
 	while (sectors) {
 		int blocks;
@@ -1345,8 +1349,8 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 	}
 }
 
-int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
-			int degraded)
+static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
+			       int degraded)
 {
 	bitmap_counter_t *bmc;
 	int rv;
@@ -1374,6 +1378,29 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
 	return rv;
 }
 
+int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
+		      int degraded)
+{
+	/* bitmap_start_sync must always report on multiples of whole
+	 * pages, otherwise resync (which is very PAGE_SIZE based) will
+	 * get confused.
+	 * So call __bitmap_start_sync repeatedly (if needed) until
+	 * At least PAGE_SIZE>>9 blocks are covered.
+	 * Return the 'or' of the result.
+	 */
+	int rv = 0;
+	int blocks1;
+
+	*blocks = 0;
+	while (*blocks < (PAGE_SIZE>>9)) {
+		rv |= __bitmap_start_sync(bitmap, offset,
+					  &blocks1, degraded);
+		offset += blocks1;
+		*blocks += blocks1;
+	}
+	return rv;
+}
+
 void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted)
 {
 	bitmap_counter_t *bmc;
@@ -1443,6 +1470,8 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
 	wait_event(bitmap->mddev->recovery_wait,
 		   atomic_read(&bitmap->mddev->recovery_active) == 0);
 
+	bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync;
+	set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
 	sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
 	s = 0;
 	while (s < sector && s < bitmap->mddev->resync_max_sectors) {
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
new file mode 100644
index 00000000000..e98900671ca
--- /dev/null
+++ b/drivers/md/bitmap.h
@@ -0,0 +1,288 @@
+/*
+ * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
+ *
+ * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
+ */
+#ifndef BITMAP_H
+#define BITMAP_H 1
+
+#define BITMAP_MAJOR_LO 3
+/* version 4 insists the bitmap is in little-endian order
+ * with version 3, it is host-endian which is non-portable
+ */
+#define BITMAP_MAJOR_HI 4
+#define	BITMAP_MAJOR_HOSTENDIAN 3
+
+#define BITMAP_MINOR 39
+
+/*
+ * in-memory bitmap:
+ *
+ * Use 16 bit block counters to track pending writes to each "chunk".
+ * The 2 high order bits are special-purpose, the first is a flag indicating
+ * whether a resync is needed.  The second is a flag indicating whether a
+ * resync is active.
+ * This means that the counter is actually 14 bits:
+ *
+ * +--------+--------+------------------------------------------------+
+ * | resync | resync |               counter                          |
+ * | needed | active |                                                |
+ * |  (0-1) |  (0-1) |              (0-16383)                         |
+ * +--------+--------+------------------------------------------------+
+ *
+ * The "resync needed" bit is set when:
+ *    a '1' bit is read from storage at startup.
+ *    a write request fails on some drives
+ *    a resync is aborted on a chunk with 'resync active' set
+ * It is cleared (and resync-active set) when a resync starts across all drives
+ * of the chunk.
+ *
+ *
+ * The "resync active" bit is set when:
+ *    a resync is started on all drives, and resync_needed is set.
+ *       resync_needed will be cleared (as long as resync_active wasn't already set).
+ * It is cleared when a resync completes.
+ *
+ * The counter counts pending write requests, plus the on-disk bit.
+ * When the counter is '1' and the resync bits are clear, the on-disk
+ * bit can be cleared aswell, thus setting the counter to 0.
+ * When we set a bit, or in the counter (to start a write), if the fields is
+ * 0, we first set the disk bit and set the counter to 1.
+ *
+ * If the counter is 0, the on-disk bit is clear and the stipe is clean
+ * Anything that dirties the stipe pushes the counter to 2 (at least)
+ * and sets the on-disk bit (lazily).
+ * If a periodic sweep find the counter at 2, it is decremented to 1.
+ * If the sweep find the counter at 1, the on-disk bit is cleared and the
+ * counter goes to zero.
+ *
+ * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
+ * counters as a fallback when "page" memory cannot be allocated:
+ *
+ * Normal case (page memory allocated):
+ *
+ *     page pointer (32-bit)
+ *
+ *     [ ] ------+
+ *               |
+ *               +-------> [   ][   ]..[   ] (4096 byte page == 2048 counters)
+ *                          c1   c2    c2048
+ *
+ * Hijacked case (page memory allocation failed):
+ *
+ *     hijacked page pointer (32-bit)
+ *
+ *     [		  ][		  ] (no page memory allocated)
+ *      counter #1 (16-bit) counter #2 (16-bit)
+ *
+ */
+
+#ifdef __KERNEL__
+
+#define PAGE_BITS (PAGE_SIZE << 3)
+#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)
+
+typedef __u16 bitmap_counter_t;
+#define COUNTER_BITS 16
+#define COUNTER_BIT_SHIFT 4
+#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8)
+#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3)
+
+#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
+#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
+#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
+#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
+#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
+#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)
+
+/* how many counters per page? */
+#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)
+/* same, except a shift value for more efficient bitops */
+#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
+/* same, except a mask value for more efficient bitops */
+#define PAGE_COUNTER_MASK  (PAGE_COUNTER_RATIO - 1)
+
+#define BITMAP_BLOCK_SIZE 512
+#define BITMAP_BLOCK_SHIFT 9
+
+/* how many blocks per chunk? (this is variable) */
+#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT)
+#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
+#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
+
+/* when hijacked, the counters and bits represent even larger "chunks" */
+/* there will be 1024 chunks represented by each counter in the page pointers */
+#define PAGEPTR_BLOCK_RATIO(bitmap) \
+			(CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
+#define PAGEPTR_BLOCK_SHIFT(bitmap) \
+			(CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
+#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
+
+/*
+ * on-disk bitmap:
+ *
+ * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
+ * file a page at a time. There's a superblock at the start of the file.
+ */
+
+/* map chunks (bits) to file pages - offset by the size of the superblock */
+#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3))
+
+#endif
+
+/*
+ * bitmap structures:
+ */
+
+#define BITMAP_MAGIC 0x6d746962
+
+/* use these for bitmap->flags and bitmap->sb->state bit-fields */
+enum bitmap_state {
+	BITMAP_STALE  = 0x002,  /* the bitmap file is out of date or had -EIO */
+	BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */
+	BITMAP_HOSTENDIAN = 0x8000,
+};
+
+/* the superblock at the front of the bitmap file -- little endian */
+typedef struct bitmap_super_s {
+	__le32 magic;        /*  0  BITMAP_MAGIC */
+	__le32 version;      /*  4  the bitmap major for now, could change... */
+	__u8  uuid[16];      /*  8  128 bit uuid - must match md device uuid */
+	__le64 events;       /* 24  event counter for the bitmap (1)*/
+	__le64 events_cleared;/*32  event counter when last bit cleared (2) */
+	__le64 sync_size;    /* 40  the size of the md device's sync range(3) */
+	__le32 state;        /* 48  bitmap state information */
+	__le32 chunksize;    /* 52  the bitmap chunk size in bytes */
+	__le32 daemon_sleep; /* 56  seconds between disk flushes */
+	__le32 write_behind; /* 60  number of outstanding write-behind writes */
+
+	__u8  pad[256 - 64]; /* set to zero */
+} bitmap_super_t;
+
+/* notes:
+ * (1) This event counter is updated before the eventcounter in the md superblock
+ *    When a bitmap is loaded, it is only accepted if this event counter is equal
+ *    to, or one greater than, the event counter in the superblock.
+ * (2) This event counter is updated when the other one is *if*and*only*if* the
+ *    array is not degraded.  As bits are not cleared when the array is degraded,
+ *    this represents the last time that any bits were cleared.
+ *    If a device is being added that has an event count with this value or
+ *    higher, it is accepted as conforming to the bitmap.
+ * (3)This is the number of sectors represented by the bitmap, and is the range that
+ *    resync happens across.  For raid1 and raid5/6 it is the size of individual
+ *    devices.  For raid10 it is the size of the array.
+ */
+
+#ifdef __KERNEL__
+
+/* the in-memory bitmap is represented by bitmap_pages */
+struct bitmap_page {
+	/*
+	 * map points to the actual memory page
+	 */
+	char *map;
+	/*
+	 * in emergencies (when map cannot be alloced), hijack the map
+	 * pointer and use it as two counters itself
+	 */
+	unsigned int hijacked:1;
+	/*
+	 * count of dirty bits on the page
+	 */
+	unsigned int  count:31;
+};
+
+/* keep track of bitmap file pages that have pending writes on them */
+struct page_list {
+	struct list_head list;
+	struct page *page;
+};
+
+/* the main bitmap structure - one per mddev */
+struct bitmap {
+	struct bitmap_page *bp;
+	unsigned long pages; /* total number of pages in the bitmap */
+	unsigned long missing_pages; /* number of pages not yet allocated */
+
+	mddev_t *mddev; /* the md device that the bitmap is for */
+
+	int counter_bits; /* how many bits per block counter */
+
+	/* bitmap chunksize -- how much data does each bit represent? */
+	unsigned long chunksize;
+	unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
+	unsigned long chunks; /* total number of data chunks for the array */
+
+	/* We hold a count on the chunk currently being synced, and drop
+	 * it when the last block is started.  If the resync is aborted
+	 * midway, we need to be able to drop that count, so we remember
+	 * the counted chunk..
+	 */
+	unsigned long syncchunk;
+
+	__u64	events_cleared;
+	int need_sync;
+
+	/* bitmap spinlock */
+	spinlock_t lock;
+
+	long offset; /* offset from superblock if file is NULL */
+	struct file *file; /* backing disk file */
+	struct page *sb_page; /* cached copy of the bitmap file superblock */
+	struct page **filemap; /* list of cache pages for the file */
+	unsigned long *filemap_attr; /* attributes associated w/ filemap pages */
+	unsigned long file_pages; /* number of pages in the file */
+	int last_page_size; /* bytes in the last page */
+
+	unsigned long flags;
+
+	int allclean;
+
+	unsigned long max_write_behind; /* write-behind mode */
+	atomic_t behind_writes;
+
+	/*
+	 * the bitmap daemon - periodically wakes up and sweeps the bitmap
+	 * file, cleaning up bits and flushing out pages to disk as necessary
+	 */
+	unsigned long daemon_lastrun; /* jiffies of last run */
+	unsigned long daemon_sleep; /* how many seconds between updates? */
+	unsigned long last_end_sync; /* when we lasted called end_sync to
+				      * update bitmap with resync progress */
+
+	atomic_t pending_writes; /* pending writes to the bitmap file */
+	wait_queue_head_t write_wait;
+	wait_queue_head_t overflow_wait;
+
+};
+
+/* the bitmap API */
+
+/* these are used only by md/bitmap */
+int  bitmap_create(mddev_t *mddev);
+void bitmap_flush(mddev_t *mddev);
+void bitmap_destroy(mddev_t *mddev);
+
+void bitmap_print_sb(struct bitmap *bitmap);
+void bitmap_update_sb(struct bitmap *bitmap);
+
+int  bitmap_setallbits(struct bitmap *bitmap);
+void bitmap_write_all(struct bitmap *bitmap);
+
+void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e);
+
+/* these are exported */
+int bitmap_startwrite(struct bitmap *bitmap, sector_t offset,
+			unsigned long sectors, int behind);
+void bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
+			unsigned long sectors, int success, int behind);
+int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded);
+void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted);
+void bitmap_close_sync(struct bitmap *bitmap);
+void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
+
+void bitmap_unplug(struct bitmap *bitmap);
+void bitmap_daemon_work(struct bitmap *bitmap);
+#endif
+
+#endif
diff --git a/drivers/md/dm-bio-list.h b/drivers/md/dm-bio-list.h
index d4509be0fe6..345098b4ca7 100644
--- a/drivers/md/dm-bio-list.h
+++ b/drivers/md/dm-bio-list.h
@@ -52,6 +52,16 @@ static inline void bio_list_add(struct bio_list *bl, struct bio *bio)
 	bl->tail = bio;
 }
 
+static inline void bio_list_add_head(struct bio_list *bl, struct bio *bio)
+{
+	bio->bi_next = bl->head;
+
+	bl->head = bio;
+
+	if (!bl->tail)
+		bl->tail = bio;
+}
+
 static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2)
 {
 	if (!bl2->head)
diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h
index d3ec217847d..3a8cfa2645c 100644
--- a/drivers/md/dm-bio-record.h
+++ b/drivers/md/dm-bio-record.h
@@ -16,30 +16,56 @@
  * functions in this file help the target record and restore the
  * original bio state.
  */
+
+struct dm_bio_vec_details {
+#if PAGE_SIZE < 65536
+	__u16 bv_len;
+	__u16 bv_offset;
+#else
+	unsigned bv_len;
+	unsigned bv_offset;
+#endif
+};
+
 struct dm_bio_details {
 	sector_t bi_sector;
 	struct block_device *bi_bdev;
 	unsigned int bi_size;
 	unsigned short bi_idx;
 	unsigned long bi_flags;
+	struct dm_bio_vec_details bi_io_vec[BIO_MAX_PAGES];
 };
 
 static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
 {
+	unsigned i;
+
 	bd->bi_sector = bio->bi_sector;
 	bd->bi_bdev = bio->bi_bdev;
 	bd->bi_size = bio->bi_size;
 	bd->bi_idx = bio->bi_idx;
 	bd->bi_flags = bio->bi_flags;
+
+	for (i = 0; i < bio->bi_vcnt; i++) {
+		bd->bi_io_vec[i].bv_len = bio->bi_io_vec[i].bv_len;
+		bd->bi_io_vec[i].bv_offset = bio->bi_io_vec[i].bv_offset;
+	}
 }
 
 static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio)
 {
+	unsigned i;
+
 	bio->bi_sector = bd->bi_sector;
 	bio->bi_bdev = bd->bi_bdev;
 	bio->bi_size = bd->bi_size;
 	bio->bi_idx = bd->bi_idx;
 	bio->bi_flags = bd->bi_flags;
+
+	for (i = 0; i < bio->bi_vcnt; i++) {
+		bio->bi_io_vec[i].bv_len = bd->bi_io_vec[i].bv_len;
+		bio->bi_io_vec[i].bv_offset = bd->bi_io_vec[i].bv_offset;
+	}
 }
 
 #endif
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index bfefd079a95..53394e863c7 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1156,8 +1156,7 @@ bad_ivmode:
 	crypto_free_ablkcipher(tfm);
 bad_cipher:
 	/* Must zero key material before freeing */
-	memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
-	kfree(cc);
+	kzfree(cc);
 	return -EINVAL;
 }
 
@@ -1183,8 +1182,7 @@ static void crypt_dtr(struct dm_target *ti)
 	dm_put_device(ti, cc->dev);
 
 	/* Must zero key material before freeing */
-	memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
-	kfree(cc);
+	kzfree(cc);
 }
 
 static int crypt_map(struct dm_target *ti, struct bio *bio,
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index dccbfb0e010..a2e26c24214 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -7,6 +7,7 @@
 
 #include "dm-exception-store.h"
 
+#include <linux/ctype.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/vmalloc.h>
@@ -14,6 +15,257 @@
 
 #define DM_MSG_PREFIX "snapshot exception stores"
 
+static LIST_HEAD(_exception_store_types);
+static DEFINE_SPINLOCK(_lock);
+
+static struct dm_exception_store_type *__find_exception_store_type(const char *name)
+{
+	struct dm_exception_store_type *type;
+
+	list_for_each_entry(type, &_exception_store_types, list)
+		if (!strcmp(name, type->name))
+			return type;
+
+	return NULL;
+}
+
+static struct dm_exception_store_type *_get_exception_store_type(const char *name)
+{
+	struct dm_exception_store_type *type;
+
+	spin_lock(&_lock);
+
+	type = __find_exception_store_type(name);
+
+	if (type && !try_module_get(type->module))
+		type = NULL;
+
+	spin_unlock(&_lock);
+
+	return type;
+}
+
+/*
+ * get_type
+ * @type_name
+ *
+ * Attempt to retrieve the dm_exception_store_type by name.  If not already
+ * available, attempt to load the appropriate module.
+ *
+ * Exstore modules are named "dm-exstore-" followed by the 'type_name'.
+ * Modules may contain multiple types.
+ * This function will first try the module "dm-exstore-<type_name>",
+ * then truncate 'type_name' on the last '-' and try again.
+ *
+ * For example, if type_name was "clustered-shared", it would search
+ * 'dm-exstore-clustered-shared' then 'dm-exstore-clustered'.
+ *
+ * 'dm-exception-store-<type_name>' is too long of a name in my
+ * opinion, which is why I've chosen to have the files
+ * containing exception store implementations be 'dm-exstore-<type_name>'.
+ * If you want your module to be autoloaded, you will follow this
+ * naming convention.
+ *
+ * Returns: dm_exception_store_type* on success, NULL on failure
+ */
+static struct dm_exception_store_type *get_type(const char *type_name)
+{
+	char *p, *type_name_dup;
+	struct dm_exception_store_type *type;
+
+	type = _get_exception_store_type(type_name);
+	if (type)
+		return type;
+
+	type_name_dup = kstrdup(type_name, GFP_KERNEL);
+	if (!type_name_dup) {
+		DMERR("No memory left to attempt load for \"%s\"", type_name);
+		return NULL;
+	}
+
+	while (request_module("dm-exstore-%s", type_name_dup) ||
+	       !(type = _get_exception_store_type(type_name))) {
+		p = strrchr(type_name_dup, '-');
+		if (!p)
+			break;
+		p[0] = '\0';
+	}
+
+	if (!type)
+		DMWARN("Module for exstore type \"%s\" not found.", type_name);
+
+	kfree(type_name_dup);
+
+	return type;
+}
+
+static void put_type(struct dm_exception_store_type *type)
+{
+	spin_lock(&_lock);
+	module_put(type->module);
+	spin_unlock(&_lock);
+}
+
+int dm_exception_store_type_register(struct dm_exception_store_type *type)
+{
+	int r = 0;
+
+	spin_lock(&_lock);
+	if (!__find_exception_store_type(type->name))
+		list_add(&type->list, &_exception_store_types);
+	else
+		r = -EEXIST;
+	spin_unlock(&_lock);
+
+	return r;
+}
+EXPORT_SYMBOL(dm_exception_store_type_register);
+
+int dm_exception_store_type_unregister(struct dm_exception_store_type *type)
+{
+	spin_lock(&_lock);
+
+	if (!__find_exception_store_type(type->name)) {
+		spin_unlock(&_lock);
+		return -EINVAL;
+	}
+
+	list_del(&type->list);
+
+	spin_unlock(&_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(dm_exception_store_type_unregister);
+
+/*
+ * Round a number up to the nearest 'size' boundary.  size must
+ * be a power of 2.
+ */
+static ulong round_up(ulong n, ulong size)
+{
+	size--;
+	return (n + size) & ~size;
+}
+
+static int set_chunk_size(struct dm_exception_store *store,
+			  const char *chunk_size_arg, char **error)
+{
+	unsigned long chunk_size_ulong;
+	char *value;
+
+	chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10);
+	if (*chunk_size_arg == '\0' || *value != '\0') {
+		*error = "Invalid chunk size";
+		return -EINVAL;
+	}
+
+	if (!chunk_size_ulong) {
+		store->chunk_size = store->chunk_mask = store->chunk_shift = 0;
+		return 0;
+	}
+
+	/*
+	 * Chunk size must be multiple of page size.  Silently
+	 * round up if it's not.
+	 */
+	chunk_size_ulong = round_up(chunk_size_ulong, PAGE_SIZE >> 9);
+
+	/* Check chunk_size is a power of 2 */
+	if (!is_power_of_2(chunk_size_ulong)) {
+		*error = "Chunk size is not a power of 2";
+		return -EINVAL;
+	}
+
+	/* Validate the chunk size against the device block size */
+	if (chunk_size_ulong % (bdev_hardsect_size(store->cow->bdev) >> 9)) {
+		*error = "Chunk size is not a multiple of device blocksize";
+		return -EINVAL;
+	}
+
+	store->chunk_size = chunk_size_ulong;
+	store->chunk_mask = chunk_size_ulong - 1;
+	store->chunk_shift = ffs(chunk_size_ulong) - 1;
+
+	return 0;
+}
+
+int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
+			      unsigned *args_used,
+			      struct dm_exception_store **store)
+{
+	int r = 0;
+	struct dm_exception_store_type *type;
+	struct dm_exception_store *tmp_store;
+	char persistent;
+
+	if (argc < 3) {
+		ti->error = "Insufficient exception store arguments";
+		return -EINVAL;
+	}
+
+	tmp_store = kmalloc(sizeof(*tmp_store), GFP_KERNEL);
+	if (!tmp_store) {
+		ti->error = "Exception store allocation failed";
+		return -ENOMEM;
+	}
+
+	persistent = toupper(*argv[1]);
+	if (persistent != 'P' && persistent != 'N') {
+		ti->error = "Persistent flag is not P or N";
+		return -EINVAL;
+	}
+
+	type = get_type(argv[1]);
+	if (!type) {
+		ti->error = "Exception store type not recognised";
+		r = -EINVAL;
+		goto bad_type;
+	}
+
+	tmp_store->type = type;
+	tmp_store->ti = ti;
+
+	r = dm_get_device(ti, argv[0], 0, 0,
+			  FMODE_READ | FMODE_WRITE, &tmp_store->cow);
+	if (r) {
+		ti->error = "Cannot get COW device";
+		goto bad_cow;
+	}
+
+	r = set_chunk_size(tmp_store, argv[2], &ti->error);
+	if (r)
+		goto bad_cow;
+
+	r = type->ctr(tmp_store, 0, NULL);
+	if (r) {
+		ti->error = "Exception store type constructor failed";
+		goto bad_ctr;
+	}
+
+	*args_used = 3;
+	*store = tmp_store;
+	return 0;
+
+bad_ctr:
+	dm_put_device(ti, tmp_store->cow);
+bad_cow:
+	put_type(type);
+bad_type:
+	kfree(tmp_store);
+	return r;
+}
+EXPORT_SYMBOL(dm_exception_store_create);
+
+void dm_exception_store_destroy(struct dm_exception_store *store)
+{
+	store->type->dtr(store);
+	dm_put_device(store->ti, store->cow);
+	put_type(store->type);
+	kfree(store);
+}
+EXPORT_SYMBOL(dm_exception_store_destroy);
+
 int dm_exception_store_init(void)
 {
 	int r;
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index bb9f33d5daa..0a2e6e7f67b 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -37,11 +37,18 @@ struct dm_snap_exception {
  * Abstraction to handle the meta/layout of exception stores (the
  * COW device).
  */
-struct dm_exception_store {
+struct dm_exception_store;
+struct dm_exception_store_type {
+	const char *name;
+	struct module *module;
+
+	int (*ctr) (struct dm_exception_store *store,
+		    unsigned argc, char **argv);
+
 	/*
 	 * Destroys this object when you've finished with it.
 	 */
-	void (*destroy) (struct dm_exception_store *store);
+	void (*dtr) (struct dm_exception_store *store);
 
 	/*
 	 * The target shouldn't read the COW device until this is
@@ -72,8 +79,9 @@ struct dm_exception_store {
 	 */
 	void (*drop_snapshot) (struct dm_exception_store *store);
 
-	int (*status) (struct dm_exception_store *store, status_type_t status,
-		       char *result, unsigned int maxlen);
+	unsigned (*status) (struct dm_exception_store *store,
+			    status_type_t status, char *result,
+			    unsigned maxlen);
 
 	/*
 	 * Return how full the snapshot is.
@@ -82,7 +90,21 @@ struct dm_exception_store {
 			       sector_t *numerator,
 			       sector_t *denominator);
 
-	struct dm_snapshot *snap;
+	/* For internal device-mapper use only. */
+	struct list_head list;
+};
+
+struct dm_exception_store {
+	struct dm_exception_store_type *type;
+	struct dm_target *ti;
+
+	struct dm_dev *cow;
+
+	/* Size of data blocks saved - must be a power of 2 */
+	chunk_t chunk_size;
+	chunk_t chunk_mask;
+	chunk_t chunk_shift;
+
 	void *context;
 };
 
@@ -129,6 +151,28 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
 
 #  endif
 
+/*
+ * Return the number of sectors in the device.
+ */
+static inline sector_t get_dev_size(struct block_device *bdev)
+{
+	return bdev->bd_inode->i_size >> SECTOR_SHIFT;
+}
+
+static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
+				      sector_t sector)
+{
+	return (sector & ~store->chunk_mask) >> store->chunk_shift;
+}
+
+int dm_exception_store_type_register(struct dm_exception_store_type *type);
+int dm_exception_store_type_unregister(struct dm_exception_store_type *type);
+
+int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
+			      unsigned *args_used,
+			      struct dm_exception_store **store);
+void dm_exception_store_destroy(struct dm_exception_store *store);
+
 int dm_exception_store_init(void);
 void dm_exception_store_exit(void);
 
@@ -141,8 +185,4 @@ void dm_persistent_snapshot_exit(void);
 int dm_transient_snapshot_init(void);
 void dm_transient_snapshot_exit(void);
 
-int dm_create_persistent(struct dm_exception_store *store);
-
-int dm_create_transient(struct dm_exception_store *store);
-
 #endif /* _LINUX_DM_EXCEPTION_STORE */
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 36e2b5e46a6..e73aabd61cd 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -370,16 +370,13 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 	while (1) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
 
-		if (!atomic_read(&io.count) || signal_pending(current))
+		if (!atomic_read(&io.count))
 			break;
 
 		io_schedule();
 	}
 	set_current_state(TASK_RUNNING);
 
-	if (atomic_read(&io.count))
-		return -EINTR;
-
 	if (error_bits)
 		*error_bits = io.error_bits;
 
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 737961f275c..be233bc4d91 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -16,40 +16,29 @@
 
 #define DM_MSG_PREFIX "dirty region log"
 
-struct dm_dirty_log_internal {
-	struct dm_dirty_log_type *type;
-
-	struct list_head list;
-	long use;
-};
-
 static LIST_HEAD(_log_types);
 static DEFINE_SPINLOCK(_lock);
 
-static struct dm_dirty_log_internal *__find_dirty_log_type(const char *name)
+static struct dm_dirty_log_type *__find_dirty_log_type(const char *name)
 {
-	struct dm_dirty_log_internal *log_type;
+	struct dm_dirty_log_type *log_type;
 
 	list_for_each_entry(log_type, &_log_types, list)
-		if (!strcmp(name, log_type->type->name))
+		if (!strcmp(name, log_type->name))
 			return log_type;
 
 	return NULL;
 }
 
-static struct dm_dirty_log_internal *_get_dirty_log_type(const char *name)
+static struct dm_dirty_log_type *_get_dirty_log_type(const char *name)
 {
-	struct dm_dirty_log_internal *log_type;
+	struct dm_dirty_log_type *log_type;
 
 	spin_lock(&_lock);
 
 	log_type = __find_dirty_log_type(name);
-	if (log_type) {
-		if (!log_type->use && !try_module_get(log_type->type->module))
-			log_type = NULL;
-		else
-			log_type->use++;
-	}
+	if (log_type && !try_module_get(log_type->module))
+		log_type = NULL;
 
 	spin_unlock(&_lock);
 
@@ -76,14 +65,14 @@ static struct dm_dirty_log_internal *_get_dirty_log_type(const char *name)
 static struct dm_dirty_log_type *get_type(const char *type_name)
 {
 	char *p, *type_name_dup;
-	struct dm_dirty_log_internal *log_type;
+	struct dm_dirty_log_type *log_type;
 
 	if (!type_name)
 		return NULL;
 
 	log_type = _get_dirty_log_type(type_name);
 	if (log_type)
-		return log_type->type;
+		return log_type;
 
 	type_name_dup = kstrdup(type_name, GFP_KERNEL);
 	if (!type_name_dup) {
@@ -105,56 +94,33 @@ static struct dm_dirty_log_type *get_type(const char *type_name)
 
 	kfree(type_name_dup);
 
-	return log_type ? log_type->type : NULL;
+	return log_type;
 }
 
 static void put_type(struct dm_dirty_log_type *type)
 {
-	struct dm_dirty_log_internal *log_type;
-
 	if (!type)
 		return;
 
 	spin_lock(&_lock);
-	log_type = __find_dirty_log_type(type->name);
-	if (!log_type)
+	if (!__find_dirty_log_type(type->name))
 		goto out;
 
-	if (!--log_type->use)
-		module_put(type->module);
-
-	BUG_ON(log_type->use < 0);
+	module_put(type->module);
 
 out:
 	spin_unlock(&_lock);
 }
 
-static struct dm_dirty_log_internal *_alloc_dirty_log_type(struct dm_dirty_log_type *type)
-{
-	struct dm_dirty_log_internal *log_type = kzalloc(sizeof(*log_type),
-							 GFP_KERNEL);
-
-	if (log_type)
-		log_type->type = type;
-
-	return log_type;
-}
-
 int dm_dirty_log_type_register(struct dm_dirty_log_type *type)
 {
-	struct dm_dirty_log_internal *log_type = _alloc_dirty_log_type(type);
 	int r = 0;
 
-	if (!log_type)
-		return -ENOMEM;
-
 	spin_lock(&_lock);
 	if (!__find_dirty_log_type(type->name))
-		list_add(&log_type->list, &_log_types);
-	else {
-		kfree(log_type);
+		list_add(&type->list, &_log_types);
+	else
 		r = -EEXIST;
-	}
 	spin_unlock(&_lock);
 
 	return r;
@@ -163,25 +129,16 @@ EXPORT_SYMBOL(dm_dirty_log_type_register);
 
 int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type)
 {
-	struct dm_dirty_log_internal *log_type;
-
 	spin_lock(&_lock);
 
-	log_type = __find_dirty_log_type(type->name);
-	if (!log_type) {
+	if (!__find_dirty_log_type(type->name)) {
 		spin_unlock(&_lock);
 		return -EINVAL;
 	}
 
-	if (log_type->use) {
-		spin_unlock(&_lock);
-		return -ETXTBSY;
-	}
-
-	list_del(&log_type->list);
+	list_del(&type->list);
 
 	spin_unlock(&_lock);
-	kfree(log_type);
 
 	return 0;
 }
diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c
index 96ea226155b..42c04f04a0c 100644
--- a/drivers/md/dm-path-selector.c
+++ b/drivers/md/dm-path-selector.c
@@ -17,9 +17,7 @@
 
 struct ps_internal {
 	struct path_selector_type pst;
-
 	struct list_head list;
-	long use;
 };
 
 #define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst)
@@ -45,12 +43,8 @@ static struct ps_internal *get_path_selector(const char *name)
 
 	down_read(&_ps_lock);
 	psi = __find_path_selector_type(name);
-	if (psi) {
-		if ((psi->use == 0) && !try_module_get(psi->pst.module))
-			psi = NULL;
-		else
-			psi->use++;
-	}
+	if (psi && !try_module_get(psi->pst.module))
+		psi = NULL;
 	up_read(&_ps_lock);
 
 	return psi;
@@ -84,11 +78,7 @@ void dm_put_path_selector(struct path_selector_type *pst)
 	if (!psi)
 		goto out;
 
-	if (--psi->use == 0)
-		module_put(psi->pst.module);
-
-	BUG_ON(psi->use < 0);
-
+	module_put(psi->pst.module);
 out:
 	up_read(&_ps_lock);
 }
@@ -136,11 +126,6 @@ int dm_unregister_path_selector(struct path_selector_type *pst)
 		return -EINVAL;
 	}
 
-	if (psi->use) {
-		up_write(&_ps_lock);
-		return -ETXTBSY;
-	}
-
 	list_del(&psi->list);
 
 	up_write(&_ps_lock);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 4d6bc101962..536ef0bef15 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -145,6 +145,8 @@ struct dm_raid1_read_record {
 	struct dm_bio_details details;
 };
 
+static struct kmem_cache *_dm_raid1_read_record_cache;
+
 /*
  * Every mirror should look like this one.
  */
@@ -586,6 +588,9 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 	int state;
 	struct bio *bio;
 	struct bio_list sync, nosync, recover, *this_list = NULL;
+	struct bio_list requeue;
+	struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
+	region_t region;
 
 	if (!writes->head)
 		return;
@@ -596,10 +601,18 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 	bio_list_init(&sync);
 	bio_list_init(&nosync);
 	bio_list_init(&recover);
+	bio_list_init(&requeue);
 
 	while ((bio = bio_list_pop(writes))) {
-		state = dm_rh_get_state(ms->rh,
-					dm_rh_bio_to_region(ms->rh, bio), 1);
+		region = dm_rh_bio_to_region(ms->rh, bio);
+
+		if (log->type->is_remote_recovering &&
+		    log->type->is_remote_recovering(log, region)) {
+			bio_list_add(&requeue, bio);
+			continue;
+		}
+
+		state = dm_rh_get_state(ms->rh, region, 1);
 		switch (state) {
 		case DM_RH_CLEAN:
 		case DM_RH_DIRTY:
@@ -619,6 +632,16 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 	}
 
 	/*
+	 * Add bios that are delayed due to remote recovery
+	 * back on to the write queue
+	 */
+	if (unlikely(requeue.head)) {
+		spin_lock_irq(&ms->lock);
+		bio_list_merge(&ms->writes, &requeue);
+		spin_unlock_irq(&ms->lock);
+	}
+
+	/*
 	 * Increment the pending counts for any regions that will
 	 * be written to (writes to recover regions are going to
 	 * be delayed).
@@ -764,9 +787,9 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 	atomic_set(&ms->suspend, 0);
 	atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
 
-	len = sizeof(struct dm_raid1_read_record);
-	ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS,
-							   len);
+	ms->read_record_pool = mempool_create_slab_pool(MIN_READ_RECORDS,
+						_dm_raid1_read_record_cache);
+
 	if (!ms->read_record_pool) {
 		ti->error = "Error creating mirror read_record_pool";
 		kfree(ms);
@@ -1279,16 +1302,31 @@ static int __init dm_mirror_init(void)
 {
 	int r;
 
+	_dm_raid1_read_record_cache = KMEM_CACHE(dm_raid1_read_record, 0);
+	if (!_dm_raid1_read_record_cache) {
+		DMERR("Can't allocate dm_raid1_read_record cache");
+		r = -ENOMEM;
+		goto bad_cache;
+	}
+
 	r = dm_register_target(&mirror_target);
-	if (r < 0)
+	if (r < 0) {
 		DMERR("Failed to register mirror target");
+		goto bad_target;
+	}
+
+	return 0;
 
+bad_target:
+	kmem_cache_destroy(_dm_raid1_read_record_cache);
+bad_cache:
 	return r;
 }
 
 static void __exit dm_mirror_exit(void)
 {
 	dm_unregister_target(&mirror_target);
+	kmem_cache_destroy(_dm_raid1_read_record_cache);
 }
 
 /* Module hooks */
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 936b34e0959..e75c6dd76a9 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -6,7 +6,6 @@
  */
 
 #include "dm-exception-store.h"
-#include "dm-snap.h"
 
 #include <linux/mm.h>
 #include <linux/pagemap.h>
@@ -89,7 +88,7 @@ struct commit_callback {
  * The top level structure for a persistent exception store.
  */
 struct pstore {
-	struct dm_snapshot *snap;	/* up pointer to my snapshot */
+	struct dm_exception_store *store;
 	int version;
 	int valid;
 	uint32_t exceptions_per_area;
@@ -141,7 +140,7 @@ static int alloc_area(struct pstore *ps)
 	int r = -ENOMEM;
 	size_t len;
 
-	len = ps->snap->chunk_size << SECTOR_SHIFT;
+	len = ps->store->chunk_size << SECTOR_SHIFT;
 
 	/*
 	 * Allocate the chunk_size block of memory that will hold
@@ -163,9 +162,12 @@ static int alloc_area(struct pstore *ps)
 
 static void free_area(struct pstore *ps)
 {
-	vfree(ps->area);
+	if (ps->area)
+		vfree(ps->area);
 	ps->area = NULL;
-	vfree(ps->zero_area);
+
+	if (ps->zero_area)
+		vfree(ps->zero_area);
 	ps->zero_area = NULL;
 }
 
@@ -189,9 +191,9 @@ static void do_metadata(struct work_struct *work)
 static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata)
 {
 	struct dm_io_region where = {
-		.bdev = ps->snap->cow->bdev,
-		.sector = ps->snap->chunk_size * chunk,
-		.count = ps->snap->chunk_size,
+		.bdev = ps->store->cow->bdev,
+		.sector = ps->store->chunk_size * chunk,
+		.count = ps->store->chunk_size,
 	};
 	struct dm_io_request io_req = {
 		.bi_rw = rw,
@@ -247,15 +249,15 @@ static int area_io(struct pstore *ps, int rw)
 
 static void zero_memory_area(struct pstore *ps)
 {
-	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
+	memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT);
 }
 
 static int zero_disk_area(struct pstore *ps, chunk_t area)
 {
 	struct dm_io_region where = {
-		.bdev = ps->snap->cow->bdev,
-		.sector = ps->snap->chunk_size * area_location(ps, area),
-		.count = ps->snap->chunk_size,
+		.bdev = ps->store->cow->bdev,
+		.sector = ps->store->chunk_size * area_location(ps, area),
+		.count = ps->store->chunk_size,
 	};
 	struct dm_io_request io_req = {
 		.bi_rw = WRITE,
@@ -278,15 +280,15 @@ static int read_header(struct pstore *ps, int *new_snapshot)
 	/*
 	 * Use default chunk size (or hardsect_size, if larger) if none supplied
 	 */
-	if (!ps->snap->chunk_size) {
-		ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
-		    bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
-		ps->snap->chunk_mask = ps->snap->chunk_size - 1;
-		ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
+	if (!ps->store->chunk_size) {
+		ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
+		    bdev_hardsect_size(ps->store->cow->bdev) >> 9);
+		ps->store->chunk_mask = ps->store->chunk_size - 1;
+		ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1;
 		chunk_size_supplied = 0;
 	}
 
-	ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->
+	ps->io_client = dm_io_client_create(sectors_to_pages(ps->store->
 							     chunk_size));
 	if (IS_ERR(ps->io_client))
 		return PTR_ERR(ps->io_client);
@@ -317,22 +319,22 @@ static int read_header(struct pstore *ps, int *new_snapshot)
 	ps->version = le32_to_cpu(dh->version);
 	chunk_size = le32_to_cpu(dh->chunk_size);
 
-	if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
+	if (!chunk_size_supplied || ps->store->chunk_size == chunk_size)
 		return 0;
 
 	DMWARN("chunk size %llu in device metadata overrides "
 	       "table chunk size of %llu.",
 	       (unsigned long long)chunk_size,
-	       (unsigned long long)ps->snap->chunk_size);
+	       (unsigned long long)ps->store->chunk_size);
 
 	/* We had a bogus chunk_size. Fix stuff up. */
 	free_area(ps);
 
-	ps->snap->chunk_size = chunk_size;
-	ps->snap->chunk_mask = chunk_size - 1;
-	ps->snap->chunk_shift = ffs(chunk_size) - 1;
+	ps->store->chunk_size = chunk_size;
+	ps->store->chunk_mask = chunk_size - 1;
+	ps->store->chunk_shift = ffs(chunk_size) - 1;
 
-	r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
+	r = dm_io_client_resize(sectors_to_pages(ps->store->chunk_size),
 				ps->io_client);
 	if (r)
 		return r;
@@ -349,13 +351,13 @@ static int write_header(struct pstore *ps)
 {
 	struct disk_header *dh;
 
-	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
+	memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT);
 
 	dh = (struct disk_header *) ps->area;
 	dh->magic = cpu_to_le32(SNAP_MAGIC);
 	dh->valid = cpu_to_le32(ps->valid);
 	dh->version = cpu_to_le32(ps->version);
-	dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);
+	dh->chunk_size = cpu_to_le32(ps->store->chunk_size);
 
 	return chunk_io(ps, 0, WRITE, 1);
 }
@@ -474,18 +476,25 @@ static struct pstore *get_info(struct dm_exception_store *store)
 static void persistent_fraction_full(struct dm_exception_store *store,
 				     sector_t *numerator, sector_t *denominator)
 {
-	*numerator = get_info(store)->next_free * store->snap->chunk_size;
-	*denominator = get_dev_size(store->snap->cow->bdev);
+	*numerator = get_info(store)->next_free * store->chunk_size;
+	*denominator = get_dev_size(store->cow->bdev);
 }
 
-static void persistent_destroy(struct dm_exception_store *store)
+static void persistent_dtr(struct dm_exception_store *store)
 {
 	struct pstore *ps = get_info(store);
 
 	destroy_workqueue(ps->metadata_wq);
-	dm_io_client_destroy(ps->io_client);
-	vfree(ps->callbacks);
+
+	/* Created in read_header */
+	if (ps->io_client)
+		dm_io_client_destroy(ps->io_client);
 	free_area(ps);
+
+	/* Allocated in persistent_read_metadata */
+	if (ps->callbacks)
+		vfree(ps->callbacks);
+
 	kfree(ps);
 }
 
@@ -507,7 +516,7 @@ static int persistent_read_metadata(struct dm_exception_store *store,
 	/*
 	 * Now we know correct chunk_size, complete the initialisation.
 	 */
-	ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
+	ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) /
 				  sizeof(struct disk_exception);
 	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
 			sizeof(*ps->callbacks));
@@ -564,10 +573,10 @@ static int persistent_prepare_exception(struct dm_exception_store *store,
 	struct pstore *ps = get_info(store);
 	uint32_t stride;
 	chunk_t next_free;
-	sector_t size = get_dev_size(store->snap->cow->bdev);
+	sector_t size = get_dev_size(store->cow->bdev);
 
 	/* Is there enough room ? */
-	if (size < ((ps->next_free + 1) * store->snap->chunk_size))
+	if (size < ((ps->next_free + 1) * store->chunk_size))
 		return -ENOSPC;
 
 	e->new_chunk = ps->next_free;
@@ -656,16 +665,17 @@ static void persistent_drop_snapshot(struct dm_exception_store *store)
 		DMWARN("write header failed");
 }
 
-int dm_create_persistent(struct dm_exception_store *store)
+static int persistent_ctr(struct dm_exception_store *store,
+			  unsigned argc, char **argv)
 {
 	struct pstore *ps;
 
 	/* allocate the pstore */
-	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
+	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
 	if (!ps)
 		return -ENOMEM;
 
-	ps->snap = store->snap;
+	ps->store = store;
 	ps->valid = 1;
 	ps->version = SNAPSHOT_DISK_VERSION;
 	ps->area = NULL;
@@ -683,22 +693,77 @@ int dm_create_persistent(struct dm_exception_store *store)
 		return -ENOMEM;
 	}
 
-	store->destroy = persistent_destroy;
-	store->read_metadata = persistent_read_metadata;
-	store->prepare_exception = persistent_prepare_exception;
-	store->commit_exception = persistent_commit_exception;
-	store->drop_snapshot = persistent_drop_snapshot;
-	store->fraction_full = persistent_fraction_full;
 	store->context = ps;
 
 	return 0;
 }
 
+static unsigned persistent_status(struct dm_exception_store *store,
+				  status_type_t status, char *result,
+				  unsigned maxlen)
+{
+	unsigned sz = 0;
+
+	switch (status) {
+	case STATUSTYPE_INFO:
+		break;
+	case STATUSTYPE_TABLE:
+		DMEMIT(" %s P %llu", store->cow->name,
+		       (unsigned long long)store->chunk_size);
+	}
+
+	return sz;
+}
+
+static struct dm_exception_store_type _persistent_type = {
+	.name = "persistent",
+	.module = THIS_MODULE,
+	.ctr = persistent_ctr,
+	.dtr = persistent_dtr,
+	.read_metadata = persistent_read_metadata,
+	.prepare_exception = persistent_prepare_exception,
+	.commit_exception = persistent_commit_exception,
+	.drop_snapshot = persistent_drop_snapshot,
+	.fraction_full = persistent_fraction_full,
+	.status = persistent_status,
+};
+
+static struct dm_exception_store_type _persistent_compat_type = {
+	.name = "P",
+	.module = THIS_MODULE,
+	.ctr = persistent_ctr,
+	.dtr = persistent_dtr,
+	.read_metadata = persistent_read_metadata,
+	.prepare_exception = persistent_prepare_exception,
+	.commit_exception = persistent_commit_exception,
+	.drop_snapshot = persistent_drop_snapshot,
+	.fraction_full = persistent_fraction_full,
+	.status = persistent_status,
+};
+
 int dm_persistent_snapshot_init(void)
 {
-	return 0;
+	int r;
+
+	r = dm_exception_store_type_register(&_persistent_type);
+	if (r) {
+		DMERR("Unable to register persistent exception store type");
+		return r;
+	}
+
+	r = dm_exception_store_type_register(&_persistent_compat_type);
+	if (r) {
+		DMERR("Unable to register old-style persistent exception "
+		      "store type");
+		dm_exception_store_type_unregister(&_persistent_type);
+		return r;
+	}
+
+	return r;
 }
 
 void dm_persistent_snapshot_exit(void)
 {
+	dm_exception_store_type_unregister(&_persistent_type);
+	dm_exception_store_type_unregister(&_persistent_compat_type);
 }
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c
index 7f6e2e6dcb0..cde5aa558e6 100644
--- a/drivers/md/dm-snap-transient.c
+++ b/drivers/md/dm-snap-transient.c
@@ -6,7 +6,6 @@
  */
 
 #include "dm-exception-store.h"
-#include "dm-snap.h"
 
 #include <linux/mm.h>
 #include <linux/pagemap.h>
@@ -23,7 +22,7 @@ struct transient_c {
 	sector_t next_free;
 };
 
-static void transient_destroy(struct dm_exception_store *store)
+static void transient_dtr(struct dm_exception_store *store)
 {
 	kfree(store->context);
 }
@@ -39,14 +38,14 @@ static int transient_read_metadata(struct dm_exception_store *store,
 static int transient_prepare_exception(struct dm_exception_store *store,
 				       struct dm_snap_exception *e)
 {
-	struct transient_c *tc = (struct transient_c *) store->context;
-	sector_t size = get_dev_size(store->snap->cow->bdev);
+	struct transient_c *tc = store->context;
+	sector_t size = get_dev_size(store->cow->bdev);
 
-	if (size < (tc->next_free + store->snap->chunk_size))
+	if (size < (tc->next_free + store->chunk_size))
 		return -1;
 
-	e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
-	tc->next_free += store->snap->chunk_size;
+	e->new_chunk = sector_to_chunk(store, tc->next_free);
+	tc->next_free += store->chunk_size;
 
 	return 0;
 }
@@ -64,20 +63,14 @@ static void transient_fraction_full(struct dm_exception_store *store,
 				    sector_t *numerator, sector_t *denominator)
 {
 	*numerator = ((struct transient_c *) store->context)->next_free;
-	*denominator = get_dev_size(store->snap->cow->bdev);
+	*denominator = get_dev_size(store->cow->bdev);
 }
 
-int dm_create_transient(struct dm_exception_store *store)
+static int transient_ctr(struct dm_exception_store *store,
+			 unsigned argc, char **argv)
 {
 	struct transient_c *tc;
 
-	store->destroy = transient_destroy;
-	store->read_metadata = transient_read_metadata;
-	store->prepare_exception = transient_prepare_exception;
-	store->commit_exception = transient_commit_exception;
-	store->drop_snapshot = NULL;
-	store->fraction_full = transient_fraction_full;
-
 	tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
 	if (!tc)
 		return -ENOMEM;
@@ -88,11 +81,70 @@ int dm_create_transient(struct dm_exception_store *store)
 	return 0;
 }
 
+static unsigned transient_status(struct dm_exception_store *store,
+				 status_type_t status, char *result,
+				 unsigned maxlen)
+{
+	unsigned sz = 0;
+
+	switch (status) {
+	case STATUSTYPE_INFO:
+		break;
+	case STATUSTYPE_TABLE:
+		DMEMIT(" %s N %llu", store->cow->name,
+		       (unsigned long long)store->chunk_size);
+	}
+
+	return sz;
+}
+
+static struct dm_exception_store_type _transient_type = {
+	.name = "transient",
+	.module = THIS_MODULE,
+	.ctr = transient_ctr,
+	.dtr = transient_dtr,
+	.read_metadata = transient_read_metadata,
+	.prepare_exception = transient_prepare_exception,
+	.commit_exception = transient_commit_exception,
+	.fraction_full = transient_fraction_full,
+	.status = transient_status,
+};
+
+static struct dm_exception_store_type _transient_compat_type = {
+	.name = "N",
+	.module = THIS_MODULE,
+	.ctr = transient_ctr,
+	.dtr = transient_dtr,
+	.read_metadata = transient_read_metadata,
+	.prepare_exception = transient_prepare_exception,
+	.commit_exception = transient_commit_exception,
+	.fraction_full = transient_fraction_full,
+	.status = transient_status,
+};
+
 int dm_transient_snapshot_init(void)
 {
-	return 0;
+	int r;
+
+	r = dm_exception_store_type_register(&_transient_type);
+	if (r) {
+		DMWARN("Unable to register transient exception store type");
+		return r;
+	}
+
+	r = dm_exception_store_type_register(&_transient_compat_type);
+	if (r) {
+		DMWARN("Unable to register old-style transient "
+		       "exception store type");
+		dm_exception_store_type_unregister(&_transient_type);
+		return r;
+	}
+
+	return r;
 }
 
 void dm_transient_snapshot_exit(void)
 {
+	dm_exception_store_type_unregister(&_transient_type);
+	dm_exception_store_type_unregister(&_transient_compat_type);
 }
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 65ff82ff124..981a0413068 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -7,7 +7,6 @@
  */
 
 #include <linux/blkdev.h>
-#include <linux/ctype.h>
 #include <linux/device-mapper.h>
 #include <linux/delay.h>
 #include <linux/fs.h>
@@ -20,9 +19,9 @@
 #include <linux/vmalloc.h>
 #include <linux/log2.h>
 #include <linux/dm-kcopyd.h>
+#include <linux/workqueue.h>
 
 #include "dm-exception-store.h"
-#include "dm-snap.h"
 #include "dm-bio-list.h"
 
 #define DM_MSG_PREFIX "snapshots"
@@ -47,9 +46,76 @@
  */
 #define MIN_IOS 256
 
+#define DM_TRACKED_CHUNK_HASH_SIZE	16
+#define DM_TRACKED_CHUNK_HASH(x)	((unsigned long)(x) & \
+					 (DM_TRACKED_CHUNK_HASH_SIZE - 1))
+
+struct exception_table {
+	uint32_t hash_mask;
+	unsigned hash_shift;
+	struct list_head *table;
+};
+
+struct dm_snapshot {
+	struct rw_semaphore lock;
+
+	struct dm_dev *origin;
+
+	/* List of snapshots per Origin */
+	struct list_head list;
+
+	/* You can't use a snapshot if this is 0 (e.g. if full) */
+	int valid;
+
+	/* Origin writes don't trigger exceptions until this is set */
+	int active;
+
+	mempool_t *pending_pool;
+
+	atomic_t pending_exceptions_count;
+
+	struct exception_table pending;
+	struct exception_table complete;
+
+	/*
+	 * pe_lock protects all pending_exception operations and access
+	 * as well as the snapshot_bios list.
+	 */
+	spinlock_t pe_lock;
+
+	/* The on disk metadata handler */
+	struct dm_exception_store *store;
+
+	struct dm_kcopyd_client *kcopyd_client;
+
+	/* Queue of snapshot writes for ksnapd to flush */
+	struct bio_list queued_bios;
+	struct work_struct queued_bios_work;
+
+	/* Chunks with outstanding reads */
+	mempool_t *tracked_chunk_pool;
+	spinlock_t tracked_chunk_lock;
+	struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
+};
+
 static struct workqueue_struct *ksnapd;
 static void flush_queued_bios(struct work_struct *work);
 
+static sector_t chunk_to_sector(struct dm_exception_store *store,
+				chunk_t chunk)
+{
+	return chunk << store->chunk_shift;
+}
+
+static int bdev_equal(struct block_device *lhs, struct block_device *rhs)
+{
+	/*
+	 * There is only ever one instance of a particular block
+	 * device so we can compare pointers safely.
+	 */
+	return lhs == rhs;
+}
+
 struct dm_snap_pending_exception {
 	struct dm_snap_exception e;
 
@@ -476,11 +542,11 @@ static int init_hash_tables(struct dm_snapshot *s)
 	 * Calculate based on the size of the original volume or
 	 * the COW volume...
 	 */
-	cow_dev_size = get_dev_size(s->cow->bdev);
+	cow_dev_size = get_dev_size(s->store->cow->bdev);
 	origin_dev_size = get_dev_size(s->origin->bdev);
 	max_buckets = calc_max_buckets();
 
-	hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift;
+	hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift;
 	hash_size = min(hash_size, max_buckets);
 
 	hash_size = rounddown_pow_of_two(hash_size);
@@ -505,58 +571,6 @@ static int init_hash_tables(struct dm_snapshot *s)
 }
 
 /*
- * Round a number up to the nearest 'size' boundary.  size must
- * be a power of 2.
- */
-static ulong round_up(ulong n, ulong size)
-{
-	size--;
-	return (n + size) & ~size;
-}
-
-static int set_chunk_size(struct dm_snapshot *s, const char *chunk_size_arg,
-			  char **error)
-{
-	unsigned long chunk_size;
-	char *value;
-
-	chunk_size = simple_strtoul(chunk_size_arg, &value, 10);
-	if (*chunk_size_arg == '\0' || *value != '\0') {
-		*error = "Invalid chunk size";
-		return -EINVAL;
-	}
-
-	if (!chunk_size) {
-		s->chunk_size = s->chunk_mask = s->chunk_shift = 0;
-		return 0;
-	}
-
-	/*
-	 * Chunk size must be multiple of page size.  Silently
-	 * round up if it's not.
-	 */
-	chunk_size = round_up(chunk_size, PAGE_SIZE >> 9);
-
-	/* Check chunk_size is a power of 2 */
-	if (!is_power_of_2(chunk_size)) {
-		*error = "Chunk size is not a power of 2";
-		return -EINVAL;
-	}
-
-	/* Validate the chunk size against the device block size */
-	if (chunk_size % (bdev_hardsect_size(s->cow->bdev) >> 9)) {
-		*error = "Chunk size is not a multiple of device blocksize";
-		return -EINVAL;
-	}
-
-	s->chunk_size = chunk_size;
-	s->chunk_mask = chunk_size - 1;
-	s->chunk_shift = ffs(chunk_size) - 1;
-
-	return 0;
-}
-
-/*
  * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
  */
 static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
@@ -564,91 +578,68 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	struct dm_snapshot *s;
 	int i;
 	int r = -EINVAL;
-	char persistent;
 	char *origin_path;
-	char *cow_path;
+	struct dm_exception_store *store;
+	unsigned args_used;
 
 	if (argc != 4) {
 		ti->error = "requires exactly 4 arguments";
 		r = -EINVAL;
-		goto bad1;
+		goto bad_args;
 	}
 
 	origin_path = argv[0];
-	cow_path = argv[1];
-	persistent = toupper(*argv[2]);
+	argv++;
+	argc--;
 
-	if (persistent != 'P' && persistent != 'N') {
-		ti->error = "Persistent flag is not P or N";
+	r = dm_exception_store_create(ti, argc, argv, &args_used, &store);
+	if (r) {
+		ti->error = "Couldn't create exception store";
 		r = -EINVAL;
-		goto bad1;
+		goto bad_args;
 	}
 
+	argv += args_used;
+	argc -= args_used;
+
 	s = kmalloc(sizeof(*s), GFP_KERNEL);
-	if (s == NULL) {
+	if (!s) {
 		ti->error = "Cannot allocate snapshot context private "
 		    "structure";
 		r = -ENOMEM;
-		goto bad1;
+		goto bad_snap;
 	}
 
 	r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
 	if (r) {
 		ti->error = "Cannot get origin device";
-		goto bad2;
-	}
-
-	r = dm_get_device(ti, cow_path, 0, 0,
-			  FMODE_READ | FMODE_WRITE, &s->cow);
-	if (r) {
-		dm_put_device(ti, s->origin);
-		ti->error = "Cannot get COW device";
-		goto bad2;
+		goto bad_origin;
 	}
 
-	r = set_chunk_size(s, argv[3], &ti->error);
-	if (r)
-		goto bad3;
-
-	s->type = persistent;
-
+	s->store = store;
 	s->valid = 1;
 	s->active = 0;
 	atomic_set(&s->pending_exceptions_count, 0);
 	init_rwsem(&s->lock);
 	spin_lock_init(&s->pe_lock);
-	s->ti = ti;
 
 	/* Allocate hash table for COW data */
 	if (init_hash_tables(s)) {
 		ti->error = "Unable to allocate hash table space";
 		r = -ENOMEM;
-		goto bad3;
-	}
-
-	s->store.snap = s;
-
-	if (persistent == 'P')
-		r = dm_create_persistent(&s->store);
-	else
-		r = dm_create_transient(&s->store);
-
-	if (r) {
-		ti->error = "Couldn't create exception store";
-		r = -EINVAL;
-		goto bad4;
+		goto bad_hash_tables;
 	}
 
 	r = dm_kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
 	if (r) {
 		ti->error = "Could not create kcopyd client";
-		goto bad5;
+		goto bad_kcopyd;
 	}
 
 	s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache);
 	if (!s->pending_pool) {
 		ti->error = "Could not allocate mempool for pending exceptions";
-		goto bad6;
+		goto bad_pending_pool;
 	}
 
 	s->tracked_chunk_pool = mempool_create_slab_pool(MIN_IOS,
@@ -665,7 +656,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	spin_lock_init(&s->tracked_chunk_lock);
 
 	/* Metadata must only be loaded into one table at once */
-	r = s->store.read_metadata(&s->store, dm_add_exception, (void *)s);
+	r = s->store->type->read_metadata(s->store, dm_add_exception,
+					  (void *)s);
 	if (r < 0) {
 		ti->error = "Failed to read snapshot metadata";
 		goto bad_load_and_register;
@@ -686,34 +678,33 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	ti->private = s;
-	ti->split_io = s->chunk_size;
+	ti->split_io = s->store->chunk_size;
 
 	return 0;
 
- bad_load_and_register:
+bad_load_and_register:
 	mempool_destroy(s->tracked_chunk_pool);
 
- bad_tracked_chunk_pool:
+bad_tracked_chunk_pool:
 	mempool_destroy(s->pending_pool);
 
- bad6:
+bad_pending_pool:
 	dm_kcopyd_client_destroy(s->kcopyd_client);
 
- bad5:
-	s->store.destroy(&s->store);
-
- bad4:
+bad_kcopyd:
 	exit_exception_table(&s->pending, pending_cache);
 	exit_exception_table(&s->complete, exception_cache);
 
- bad3:
-	dm_put_device(ti, s->cow);
+bad_hash_tables:
 	dm_put_device(ti, s->origin);
 
- bad2:
+bad_origin:
 	kfree(s);
 
- bad1:
+bad_snap:
+	dm_exception_store_destroy(store);
+
+bad_args:
 	return r;
 }
 
@@ -724,8 +715,6 @@ static void __free_exceptions(struct dm_snapshot *s)
 
 	exit_exception_table(&s->pending, pending_cache);
 	exit_exception_table(&s->complete, exception_cache);
-
-	s->store.destroy(&s->store);
 }
 
 static void snapshot_dtr(struct dm_target *ti)
@@ -761,7 +750,8 @@ static void snapshot_dtr(struct dm_target *ti)
 	mempool_destroy(s->pending_pool);
 
 	dm_put_device(ti, s->origin);
-	dm_put_device(ti, s->cow);
+
+	dm_exception_store_destroy(s->store);
 
 	kfree(s);
 }
@@ -820,12 +810,12 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err)
 	else if (err == -ENOMEM)
 		DMERR("Invalidating snapshot: Unable to allocate exception.");
 
-	if (s->store.drop_snapshot)
-		s->store.drop_snapshot(&s->store);
+	if (s->store->type->drop_snapshot)
+		s->store->type->drop_snapshot(s->store);
 
 	s->valid = 0;
 
-	dm_table_event(s->ti->table);
+	dm_table_event(s->store->ti->table);
 }
 
 static void get_pending_exception(struct dm_snap_pending_exception *pe)
@@ -943,8 +933,8 @@ static void copy_callback(int read_err, unsigned long write_err, void *context)
 
 	else
 		/* Update the metadata if we are persistent */
-		s->store.commit_exception(&s->store, &pe->e, commit_callback,
-					  pe);
+		s->store->type->commit_exception(s->store, &pe->e,
+						 commit_callback, pe);
 }
 
 /*
@@ -960,11 +950,11 @@ static void start_copy(struct dm_snap_pending_exception *pe)
 	dev_size = get_dev_size(bdev);
 
 	src.bdev = bdev;
-	src.sector = chunk_to_sector(s, pe->e.old_chunk);
-	src.count = min(s->chunk_size, dev_size - src.sector);
+	src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
+	src.count = min(s->store->chunk_size, dev_size - src.sector);
 
-	dest.bdev = s->cow->bdev;
-	dest.sector = chunk_to_sector(s, pe->e.new_chunk);
+	dest.bdev = s->store->cow->bdev;
+	dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
 	dest.count = src.count;
 
 	/* Hand over to kcopyd */
@@ -972,6 +962,17 @@ static void start_copy(struct dm_snap_pending_exception *pe)
 		    &src, 1, &dest, 0, copy_callback, pe);
 }
 
+static struct dm_snap_pending_exception *
+__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
+{
+	struct dm_snap_exception *e = lookup_exception(&s->pending, chunk);
+
+	if (!e)
+		return NULL;
+
+	return container_of(e, struct dm_snap_pending_exception, e);
+}
+
 /*
  * Looks to see if this snapshot already has a pending exception
  * for this chunk, otherwise it allocates a new one and inserts
@@ -981,40 +982,15 @@ static void start_copy(struct dm_snap_pending_exception *pe)
  * this.
  */
 static struct dm_snap_pending_exception *
-__find_pending_exception(struct dm_snapshot *s, struct bio *bio)
+__find_pending_exception(struct dm_snapshot *s,
+			 struct dm_snap_pending_exception *pe, chunk_t chunk)
 {
-	struct dm_snap_exception *e;
-	struct dm_snap_pending_exception *pe;
-	chunk_t chunk = sector_to_chunk(s, bio->bi_sector);
+	struct dm_snap_pending_exception *pe2;
 
-	/*
-	 * Is there a pending exception for this already ?
-	 */
-	e = lookup_exception(&s->pending, chunk);
-	if (e) {
-		/* cast the exception to a pending exception */
-		pe = container_of(e, struct dm_snap_pending_exception, e);
-		goto out;
-	}
-
-	/*
-	 * Create a new pending exception, we don't want
-	 * to hold the lock while we do this.
-	 */
-	up_write(&s->lock);
-	pe = alloc_pending_exception(s);
-	down_write(&s->lock);
-
-	if (!s->valid) {
-		free_pending_exception(pe);
-		return NULL;
-	}
-
-	e = lookup_exception(&s->pending, chunk);
-	if (e) {
+	pe2 = __lookup_pending_exception(s, chunk);
+	if (pe2) {
 		free_pending_exception(pe);
-		pe = container_of(e, struct dm_snap_pending_exception, e);
-		goto out;
+		return pe2;
 	}
 
 	pe->e.old_chunk = chunk;
@@ -1024,7 +1000,7 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
 	atomic_set(&pe->ref_count, 0);
 	pe->started = 0;
 
-	if (s->store.prepare_exception(&s->store, &pe->e)) {
+	if (s->store->type->prepare_exception(s->store, &pe->e)) {
 		free_pending_exception(pe);
 		return NULL;
 	}
@@ -1032,17 +1008,18 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
 	get_pending_exception(pe);
 	insert_exception(&s->pending, &pe->e);
 
- out:
 	return pe;
 }
 
 static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e,
 			    struct bio *bio, chunk_t chunk)
 {
-	bio->bi_bdev = s->cow->bdev;
-	bio->bi_sector = chunk_to_sector(s, dm_chunk_number(e->new_chunk) +
-			 (chunk - e->old_chunk)) +
-			 (bio->bi_sector & s->chunk_mask);
+	bio->bi_bdev = s->store->cow->bdev;
+	bio->bi_sector = chunk_to_sector(s->store,
+					 dm_chunk_number(e->new_chunk) +
+					 (chunk - e->old_chunk)) +
+					 (bio->bi_sector &
+					  s->store->chunk_mask);
 }
 
 static int snapshot_map(struct dm_target *ti, struct bio *bio,
@@ -1054,7 +1031,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
 	chunk_t chunk;
 	struct dm_snap_pending_exception *pe = NULL;
 
-	chunk = sector_to_chunk(s, bio->bi_sector);
+	chunk = sector_to_chunk(s->store, bio->bi_sector);
 
 	/* Full snapshots are not usable */
 	/* To get here the table must be live so s->active is always set. */
@@ -1083,11 +1060,31 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
 	 * writeable.
 	 */
 	if (bio_rw(bio) == WRITE) {
-		pe = __find_pending_exception(s, bio);
+		pe = __lookup_pending_exception(s, chunk);
 		if (!pe) {
-			__invalidate_snapshot(s, -ENOMEM);
-			r = -EIO;
-			goto out_unlock;
+			up_write(&s->lock);
+			pe = alloc_pending_exception(s);
+			down_write(&s->lock);
+
+			if (!s->valid) {
+				free_pending_exception(pe);
+				r = -EIO;
+				goto out_unlock;
+			}
+
+			e = lookup_exception(&s->complete, chunk);
+			if (e) {
+				free_pending_exception(pe);
+				remap_exception(s, e, bio, chunk);
+				goto out_unlock;
+			}
+
+			pe = __find_pending_exception(s, pe, chunk);
+			if (!pe) {
+				__invalidate_snapshot(s, -ENOMEM);
+				r = -EIO;
+				goto out_unlock;
+			}
 		}
 
 		remap_exception(s, &pe->e, bio, chunk);
@@ -1137,24 +1134,25 @@ static void snapshot_resume(struct dm_target *ti)
 static int snapshot_status(struct dm_target *ti, status_type_t type,
 			   char *result, unsigned int maxlen)
 {
+	unsigned sz = 0;
 	struct dm_snapshot *snap = ti->private;
 
 	switch (type) {
 	case STATUSTYPE_INFO:
 		if (!snap->valid)
-			snprintf(result, maxlen, "Invalid");
+			DMEMIT("Invalid");
 		else {
-			if (snap->store.fraction_full) {
+			if (snap->store->type->fraction_full) {
 				sector_t numerator, denominator;
-				snap->store.fraction_full(&snap->store,
-							  &numerator,
-							  &denominator);
-				snprintf(result, maxlen, "%llu/%llu",
-					(unsigned long long)numerator,
-					(unsigned long long)denominator);
+				snap->store->type->fraction_full(snap->store,
+								 &numerator,
+								 &denominator);
+				DMEMIT("%llu/%llu",
+				       (unsigned long long)numerator,
+				       (unsigned long long)denominator);
 			}
 			else
-				snprintf(result, maxlen, "Unknown");
+				DMEMIT("Unknown");
 		}
 		break;
 
@@ -1164,10 +1162,9 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
 		 * to make private copies if the output is to
 		 * make sense.
 		 */
-		snprintf(result, maxlen, "%s %s %c %llu",
-			 snap->origin->name, snap->cow->name,
-			 snap->type,
-			 (unsigned long long)snap->chunk_size);
+		DMEMIT("%s", snap->origin->name);
+		snap->store->type->status(snap->store, type, result + sz,
+					  maxlen - sz);
 		break;
 	}
 
@@ -1196,14 +1193,14 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
 			goto next_snapshot;
 
 		/* Nothing to do if writing beyond end of snapshot */
-		if (bio->bi_sector >= dm_table_get_size(snap->ti->table))
+		if (bio->bi_sector >= dm_table_get_size(snap->store->ti->table))
 			goto next_snapshot;
 
 		/*
 		 * Remember, different snapshots can have
 		 * different chunk sizes.
 		 */
-		chunk = sector_to_chunk(snap, bio->bi_sector);
+		chunk = sector_to_chunk(snap->store, bio->bi_sector);
 
 		/*
 		 * Check exception table to see if block
@@ -1217,10 +1214,28 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
 		if (e)
 			goto next_snapshot;
 
-		pe = __find_pending_exception(snap, bio);
+		pe = __lookup_pending_exception(snap, chunk);
 		if (!pe) {
-			__invalidate_snapshot(snap, -ENOMEM);
-			goto next_snapshot;
+			up_write(&snap->lock);
+			pe = alloc_pending_exception(snap);
+			down_write(&snap->lock);
+
+			if (!snap->valid) {
+				free_pending_exception(pe);
+				goto next_snapshot;
+			}
+
+			e = lookup_exception(&snap->complete, chunk);
+			if (e) {
+				free_pending_exception(pe);
+				goto next_snapshot;
+			}
+
+			pe = __find_pending_exception(snap, pe, chunk);
+			if (!pe) {
+				__invalidate_snapshot(snap, -ENOMEM);
+				goto next_snapshot;
+			}
 		}
 
 		if (!primary_pe) {
@@ -1360,7 +1375,8 @@ static void origin_resume(struct dm_target *ti)
 	o = __lookup_origin(dev->bdev);
 	if (o)
 		list_for_each_entry (snap, &o->snapshots, list)
-			chunk_size = min_not_zero(chunk_size, snap->chunk_size);
+			chunk_size = min_not_zero(chunk_size,
+						  snap->store->chunk_size);
 	up_read(&_origins_lock);
 
 	ti->split_io = chunk_size;
diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h
deleted file mode 100644
index d9e62b43cf8..00000000000
--- a/drivers/md/dm-snap.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
- *
- * This file is released under the GPL.
- */
-
-#ifndef DM_SNAPSHOT_H
-#define DM_SNAPSHOT_H
-
-#include <linux/device-mapper.h>
-#include "dm-exception-store.h"
-#include "dm-bio-list.h"
-#include <linux/blkdev.h>
-#include <linux/workqueue.h>
-
-struct exception_table {
-	uint32_t hash_mask;
-	unsigned hash_shift;
-	struct list_head *table;
-};
-
-#define DM_TRACKED_CHUNK_HASH_SIZE	16
-#define DM_TRACKED_CHUNK_HASH(x)	((unsigned long)(x) & \
-					 (DM_TRACKED_CHUNK_HASH_SIZE - 1))
-
-struct dm_snapshot {
-	struct rw_semaphore lock;
-	struct dm_target *ti;
-
-	struct dm_dev *origin;
-	struct dm_dev *cow;
-
-	/* List of snapshots per Origin */
-	struct list_head list;
-
-	/* Size of data blocks saved - must be a power of 2 */
-	chunk_t chunk_size;
-	chunk_t chunk_mask;
-	chunk_t chunk_shift;
-
-	/* You can't use a snapshot if this is 0 (e.g. if full) */
-	int valid;
-
-	/* Origin writes don't trigger exceptions until this is set */
-	int active;
-
-	/* Used for display of table */
-	char type;
-
-	mempool_t *pending_pool;
-
-	atomic_t pending_exceptions_count;
-
-	struct exception_table pending;
-	struct exception_table complete;
-
-	/*
-	 * pe_lock protects all pending_exception operations and access
-	 * as well as the snapshot_bios list.
-	 */
-	spinlock_t pe_lock;
-
-	/* The on disk metadata handler */
-	struct dm_exception_store store;
-
-	struct dm_kcopyd_client *kcopyd_client;
-
-	/* Queue of snapshot writes for ksnapd to flush */
-	struct bio_list queued_bios;
-	struct work_struct queued_bios_work;
-
-	/* Chunks with outstanding reads */
-	mempool_t *tracked_chunk_pool;
-	spinlock_t tracked_chunk_lock;
-	struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
-};
-
-/*
- * Return the number of sectors in the device.
- */
-static inline sector_t get_dev_size(struct block_device *bdev)
-{
-	return bdev->bd_inode->i_size >> SECTOR_SHIFT;
-}
-
-static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector)
-{
-	return (sector & ~s->chunk_mask) >> s->chunk_shift;
-}
-
-static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk)
-{
-	return chunk << s->chunk_shift;
-}
-
-static inline int bdev_equal(struct block_device *lhs, struct block_device *rhs)
-{
-	/*
-	 * There is only ever one instance of a particular block
-	 * device so we can compare pointers safely.
-	 */
-	return lhs == rhs;
-}
-
-#endif
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 2fd66c30f7f..e8361b191b9 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -399,28 +399,30 @@ static int check_device_area(struct dm_dev_internal *dd, sector_t start,
 }
 
 /*
- * This upgrades the mode on an already open dm_dev.  Being
+ * This upgrades the mode on an already open dm_dev, being
  * careful to leave things as they were if we fail to reopen the
- * device.
+ * device and not to touch the existing bdev field in case
+ * it is accessed concurrently inside dm_table_any_congested().
  */
 static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
 			struct mapped_device *md)
 {
 	int r;
-	struct dm_dev_internal dd_copy;
-	dev_t dev = dd->dm_dev.bdev->bd_dev;
+	struct dm_dev_internal dd_new, dd_old;
 
-	dd_copy = *dd;
+	dd_new = dd_old = *dd;
+
+	dd_new.dm_dev.mode |= new_mode;
+	dd_new.dm_dev.bdev = NULL;
+
+	r = open_dev(&dd_new, dd->dm_dev.bdev->bd_dev, md);
+	if (r)
+		return r;
 
 	dd->dm_dev.mode |= new_mode;
-	dd->dm_dev.bdev = NULL;
-	r = open_dev(dd, dev, md);
-	if (!r)
-		close_dev(&dd_copy, md);
-	else
-		*dd = dd_copy;
+	close_dev(&dd_old, md);
 
-	return r;
+	return 0;
 }
 
 /*
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 7decf10006e..04feccf2a99 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -14,45 +14,34 @@
 
 #define DM_MSG_PREFIX "target"
 
-struct tt_internal {
-	struct target_type tt;
-
-	struct list_head list;
-	long use;
-};
-
 static LIST_HEAD(_targets);
 static DECLARE_RWSEM(_lock);
 
 #define DM_MOD_NAME_SIZE 32
 
-static inline struct tt_internal *__find_target_type(const char *name)
+static inline struct target_type *__find_target_type(const char *name)
 {
-	struct tt_internal *ti;
+	struct target_type *tt;
 
-	list_for_each_entry (ti, &_targets, list)
-		if (!strcmp(name, ti->tt.name))
-			return ti;
+	list_for_each_entry(tt, &_targets, list)
+		if (!strcmp(name, tt->name))
+			return tt;
 
 	return NULL;
 }
 
-static struct tt_internal *get_target_type(const char *name)
+static struct target_type *get_target_type(const char *name)
 {
-	struct tt_internal *ti;
+	struct target_type *tt;
 
 	down_read(&_lock);
 
-	ti = __find_target_type(name);
-	if (ti) {
-		if ((ti->use == 0) && !try_module_get(ti->tt.module))
-			ti = NULL;
-		else
-			ti->use++;
-	}
+	tt = __find_target_type(name);
+	if (tt && !try_module_get(tt->module))
+		tt = NULL;
 
 	up_read(&_lock);
-	return ti;
+	return tt;
 }
 
 static void load_module(const char *name)
@@ -62,92 +51,59 @@ static void load_module(const char *name)
 
 struct target_type *dm_get_target_type(const char *name)
 {
-	struct tt_internal *ti = get_target_type(name);
+	struct target_type *tt = get_target_type(name);
 
-	if (!ti) {
+	if (!tt) {
 		load_module(name);
-		ti = get_target_type(name);
+		tt = get_target_type(name);
 	}
 
-	return ti ? &ti->tt : NULL;
+	return tt;
 }
 
-void dm_put_target_type(struct target_type *t)
+void dm_put_target_type(struct target_type *tt)
 {
-	struct tt_internal *ti = (struct tt_internal *) t;
-
 	down_read(&_lock);
-	if (--ti->use == 0)
-		module_put(ti->tt.module);
-
-	BUG_ON(ti->use < 0);
+	module_put(tt->module);
 	up_read(&_lock);
-
-	return;
-}
-
-static struct tt_internal *alloc_target(struct target_type *t)
-{
-	struct tt_internal *ti = kzalloc(sizeof(*ti), GFP_KERNEL);
-
-	if (ti)
-		ti->tt = *t;
-
-	return ti;
 }
 
-
 int dm_target_iterate(void (*iter_func)(struct target_type *tt,
 					void *param), void *param)
 {
-	struct tt_internal *ti;
+	struct target_type *tt;
 
 	down_read(&_lock);
-	list_for_each_entry (ti, &_targets, list)
-		iter_func(&ti->tt, param);
+	list_for_each_entry(tt, &_targets, list)
+		iter_func(tt, param);
 	up_read(&_lock);
 
 	return 0;
 }
 
-int dm_register_target(struct target_type *t)
+int dm_register_target(struct target_type *tt)
 {
 	int rv = 0;
-	struct tt_internal *ti = alloc_target(t);
-
-	if (!ti)
-		return -ENOMEM;
 
 	down_write(&_lock);
-	if (__find_target_type(t->name))
+	if (__find_target_type(tt->name))
 		rv = -EEXIST;
 	else
-		list_add(&ti->list, &_targets);
+		list_add(&tt->list, &_targets);
 
 	up_write(&_lock);
-	if (rv)
-		kfree(ti);
 	return rv;
 }
 
-void dm_unregister_target(struct target_type *t)
+void dm_unregister_target(struct target_type *tt)
 {
-	struct tt_internal *ti;
-
 	down_write(&_lock);
-	if (!(ti = __find_target_type(t->name))) {
-		DMCRIT("Unregistering unrecognised target: %s", t->name);
-		BUG();
-	}
-
-	if (ti->use) {
-		DMCRIT("Attempt to unregister target still in use: %s",
-		       t->name);
+	if (!__find_target_type(tt->name)) {
+		DMCRIT("Unregistering unrecognised target: %s", tt->name);
 		BUG();
 	}
 
-	list_del(&ti->list);
-	kfree(ti);
+	list_del(&tt->list);
 
 	up_write(&_lock);
 }
@@ -156,17 +112,17 @@ void dm_unregister_target(struct target_type *t)
  * io-err: always fails an io, useful for bringing
  * up LVs that have holes in them.
  */
-static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args)
+static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args)
 {
 	return 0;
 }
 
-static void io_err_dtr(struct dm_target *ti)
+static void io_err_dtr(struct dm_target *tt)
 {
 	/* empty */
 }
 
-static int io_err_map(struct dm_target *ti, struct bio *bio,
+static int io_err_map(struct dm_target *tt, struct bio *bio,
 		      union map_info *map_context)
 {
 	return -EIO;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8d40f27cce8..788ba96a625 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -99,19 +99,9 @@ union map_info *dm_get_mapinfo(struct bio *bio)
 /*
  * Work processed by per-device workqueue.
  */
-struct dm_wq_req {
-	enum {
-		DM_WQ_FLUSH_DEFERRED,
-	} type;
-	struct work_struct work;
-	struct mapped_device *md;
-	void *context;
-};
-
 struct mapped_device {
 	struct rw_semaphore io_lock;
 	struct mutex suspend_lock;
-	spinlock_t pushback_lock;
 	rwlock_t map_lock;
 	atomic_t holders;
 	atomic_t open_count;
@@ -129,8 +119,9 @@ struct mapped_device {
 	 */
 	atomic_t pending;
 	wait_queue_head_t wait;
+	struct work_struct work;
 	struct bio_list deferred;
-	struct bio_list pushback;
+	spinlock_t deferred_lock;
 
 	/*
 	 * Processing queue (flush/barriers)
@@ -453,7 +444,9 @@ static int queue_io(struct mapped_device *md, struct bio *bio)
 		return 1;
 	}
 
+	spin_lock_irq(&md->deferred_lock);
 	bio_list_add(&md->deferred, bio);
+	spin_unlock_irq(&md->deferred_lock);
 
 	up_write(&md->io_lock);
 	return 0;		/* deferred successfully */
@@ -537,16 +530,14 @@ static void dec_pending(struct dm_io *io, int error)
 		if (io->error == DM_ENDIO_REQUEUE) {
 			/*
 			 * Target requested pushing back the I/O.
-			 * This must be handled before the sleeper on
-			 * suspend queue merges the pushback list.
 			 */
-			spin_lock_irqsave(&md->pushback_lock, flags);
+			spin_lock_irqsave(&md->deferred_lock, flags);
 			if (__noflush_suspending(md))
-				bio_list_add(&md->pushback, io->bio);
+				bio_list_add(&md->deferred, io->bio);
 			else
 				/* noflush suspend was interrupted. */
 				io->error = -EIO;
-			spin_unlock_irqrestore(&md->pushback_lock, flags);
+			spin_unlock_irqrestore(&md->deferred_lock, flags);
 		}
 
 		end_io_acct(io);
@@ -834,20 +825,22 @@ static int __clone_and_map(struct clone_info *ci)
 }
 
 /*
- * Split the bio into several clones.
+ * Split the bio into several clones and submit it to targets.
  */
-static int __split_bio(struct mapped_device *md, struct bio *bio)
+static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 {
 	struct clone_info ci;
 	int error = 0;
 
 	ci.map = dm_get_table(md);
-	if (unlikely(!ci.map))
-		return -EIO;
+	if (unlikely(!ci.map)) {
+		bio_io_error(bio);
+		return;
+	}
 	if (unlikely(bio_barrier(bio) && !dm_table_barrier_ok(ci.map))) {
 		dm_table_put(ci.map);
 		bio_endio(bio, -EOPNOTSUPP);
-		return 0;
+		return;
 	}
 	ci.md = md;
 	ci.bio = bio;
@@ -867,8 +860,6 @@ static int __split_bio(struct mapped_device *md, struct bio *bio)
 	/* drop the extra reference count */
 	dec_pending(ci.io, error);
 	dm_table_put(ci.map);
-
-	return 0;
 }
 /*-----------------------------------------------------------------
  * CRUD END
@@ -959,8 +950,9 @@ static int dm_request(struct request_queue *q, struct bio *bio)
 		down_read(&md->io_lock);
 	}
 
-	r = __split_bio(md, bio);
+	__split_and_process_bio(md, bio);
 	up_read(&md->io_lock);
+	return 0;
 
 out_req:
 	if (r < 0)
@@ -1074,6 +1066,8 @@ out:
 
 static struct block_device_operations dm_blk_dops;
 
+static void dm_wq_work(struct work_struct *work);
+
 /*
  * Allocate and initialise a blank device with a given minor.
  */
@@ -1101,7 +1095,7 @@ static struct mapped_device *alloc_dev(int minor)
 
 	init_rwsem(&md->io_lock);
 	mutex_init(&md->suspend_lock);
-	spin_lock_init(&md->pushback_lock);
+	spin_lock_init(&md->deferred_lock);
 	rwlock_init(&md->map_lock);
 	atomic_set(&md->holders, 1);
 	atomic_set(&md->open_count, 0);
@@ -1118,6 +1112,7 @@ static struct mapped_device *alloc_dev(int minor)
 	md->queue->backing_dev_info.congested_fn = dm_any_congested;
 	md->queue->backing_dev_info.congested_data = md;
 	blk_queue_make_request(md->queue, dm_request);
+	blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
 	md->queue->unplug_fn = dm_unplug_all;
 	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
@@ -1140,6 +1135,7 @@ static struct mapped_device *alloc_dev(int minor)
 
 	atomic_set(&md->pending, 0);
 	init_waitqueue_head(&md->wait);
+	INIT_WORK(&md->work, dm_wq_work);
 	init_waitqueue_head(&md->eventq);
 
 	md->disk->major = _major;
@@ -1379,18 +1375,24 @@ void dm_put(struct mapped_device *md)
 }
 EXPORT_SYMBOL_GPL(dm_put);
 
-static int dm_wait_for_completion(struct mapped_device *md)
+static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 {
 	int r = 0;
+	DECLARE_WAITQUEUE(wait, current);
+
+	dm_unplug_all(md->queue);
+
+	add_wait_queue(&md->wait, &wait);
 
 	while (1) {
-		set_current_state(TASK_INTERRUPTIBLE);
+		set_current_state(interruptible);
 
 		smp_mb();
 		if (!atomic_read(&md->pending))
 			break;
 
-		if (signal_pending(current)) {
+		if (interruptible == TASK_INTERRUPTIBLE &&
+		    signal_pending(current)) {
 			r = -EINTR;
 			break;
 		}
@@ -1399,67 +1401,40 @@ static int dm_wait_for_completion(struct mapped_device *md)
 	}
 	set_current_state(TASK_RUNNING);
 
+	remove_wait_queue(&md->wait, &wait);
+
 	return r;
 }
 
 /*
  * Process the deferred bios
  */
-static void __flush_deferred_io(struct mapped_device *md)
+static void dm_wq_work(struct work_struct *work)
 {
+	struct mapped_device *md = container_of(work, struct mapped_device,
+						work);
 	struct bio *c;
 
-	while ((c = bio_list_pop(&md->deferred))) {
-		if (__split_bio(md, c))
-			bio_io_error(c);
-	}
-
-	clear_bit(DMF_BLOCK_IO, &md->flags);
-}
+	down_write(&md->io_lock);
 
-static void __merge_pushback_list(struct mapped_device *md)
-{
-	unsigned long flags;
+next_bio:
+	spin_lock_irq(&md->deferred_lock);
+	c = bio_list_pop(&md->deferred);
+	spin_unlock_irq(&md->deferred_lock);
 
-	spin_lock_irqsave(&md->pushback_lock, flags);
-	clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
-	bio_list_merge_head(&md->deferred, &md->pushback);
-	bio_list_init(&md->pushback);
-	spin_unlock_irqrestore(&md->pushback_lock, flags);
-}
+	if (c) {
+		__split_and_process_bio(md, c);
+		goto next_bio;
+	}
 
-static void dm_wq_work(struct work_struct *work)
-{
-	struct dm_wq_req *req = container_of(work, struct dm_wq_req, work);
-	struct mapped_device *md = req->md;
+	clear_bit(DMF_BLOCK_IO, &md->flags);
 
-	down_write(&md->io_lock);
-	switch (req->type) {
-	case DM_WQ_FLUSH_DEFERRED:
-		__flush_deferred_io(md);
-		break;
-	default:
-		DMERR("dm_wq_work: unrecognised work type %d", req->type);
-		BUG();
-	}
 	up_write(&md->io_lock);
 }
 
-static void dm_wq_queue(struct mapped_device *md, int type, void *context,
-			struct dm_wq_req *req)
-{
-	req->type = type;
-	req->md = md;
-	req->context = context;
-	INIT_WORK(&req->work, dm_wq_work);
-	queue_work(md->wq, &req->work);
-}
-
-static void dm_queue_flush(struct mapped_device *md, int type, void *context)
+static void dm_queue_flush(struct mapped_device *md)
 {
-	struct dm_wq_req req;
-
-	dm_wq_queue(md, type, context, &req);
+	queue_work(md->wq, &md->work);
 	flush_workqueue(md->wq);
 }
 
@@ -1534,7 +1509,6 @@ static void unlock_fs(struct mapped_device *md)
 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 {
 	struct dm_table *map = NULL;
-	DECLARE_WAITQUEUE(wait, current);
 	int r = 0;
 	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
 	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
@@ -1584,28 +1558,22 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	down_write(&md->io_lock);
 	set_bit(DMF_BLOCK_IO, &md->flags);
 
-	add_wait_queue(&md->wait, &wait);
 	up_write(&md->io_lock);
 
-	/* unplug */
-	if (map)
-		dm_table_unplug_all(map);
-
 	/*
 	 * Wait for the already-mapped ios to complete.
 	 */
-	r = dm_wait_for_completion(md);
+	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
 
 	down_write(&md->io_lock);
-	remove_wait_queue(&md->wait, &wait);
 
 	if (noflush)
-		__merge_pushback_list(md);
+		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 	up_write(&md->io_lock);
 
 	/* were we interrupted ? */
 	if (r < 0) {
-		dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
+		dm_queue_flush(md);
 
 		unlock_fs(md);
 		goto out; /* pushback list is already flushed, so skip flush */
@@ -1645,7 +1613,7 @@ int dm_resume(struct mapped_device *md)
 	if (r)
 		goto out;
 
-	dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
+	dm_queue_flush(md);
 
 	unlock_fs(md);
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 20194e000c5..b48397c0abb 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -60,7 +60,7 @@ int dm_table_barrier_ok(struct dm_table *t);
 int dm_target_init(void);
 void dm_target_exit(void);
 struct target_type *dm_get_target_type(const char *name);
-void dm_put_target_type(struct target_type *t);
+void dm_put_target_type(struct target_type *tt);
 int dm_target_iterate(void (*iter_func)(struct target_type *tt,
 					void *param), void *param);
 
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 86d9adf90e7..8695809b24b 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -62,7 +62,10 @@
 #define	ModeShift	5
 
 #define MaxFault	50
-#include <linux/raid/md.h>
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include "md.h"
+#include <linux/seq_file.h>
 
 
 static void faulty_fail(struct bio *bio, int error)
@@ -280,6 +283,17 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size)
 	return 0;
 }
 
+static sector_t faulty_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	WARN_ONCE(raid_disks,
+		  "%s does not support generic reshape\n", __func__);
+
+	if (sectors == 0)
+		return mddev->dev_sectors;
+
+	return sectors;
+}
+
 static int run(mddev_t *mddev)
 {
 	mdk_rdev_t *rdev;
@@ -298,7 +312,7 @@ static int run(mddev_t *mddev)
 	list_for_each_entry(rdev, &mddev->disks, same_set)
 		conf->rdev = rdev;
 
-	mddev->array_sectors = mddev->size * 2;
+	md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
 	mddev->private = conf;
 
 	reconfig(mddev, mddev->layout, -1);
@@ -325,6 +339,7 @@ static struct mdk_personality faulty_personality =
 	.stop		= stop,
 	.status		= status,
 	.reconfig	= reconfig,
+	.size		= faulty_size,
 };
 
 static int __init raid_init(void)
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 09658b21847..7a36e38393a 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -16,7 +16,11 @@
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
 */
 
-#include <linux/raid/linear.h>
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "linear.h"
 
 /*
  * find which device holds a particular offset 
@@ -97,6 +101,16 @@ static int linear_congested(void *data, int bits)
 	return ret;
 }
 
+static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	linear_conf_t *conf = mddev_to_conf(mddev);
+
+	WARN_ONCE(sectors || raid_disks,
+		  "%s does not support generic reshape\n", __func__);
+
+	return conf->array_sectors;
+}
+
 static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 {
 	linear_conf_t *conf;
@@ -135,8 +149,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
-		disk->num_sectors = rdev->size * 2;
-		conf->array_sectors += rdev->size * 2;
+		disk->num_sectors = rdev->sectors;
+		conf->array_sectors += rdev->sectors;
 
 		cnt++;
 	}
@@ -249,7 +263,7 @@ static int linear_run (mddev_t *mddev)
 	if (!conf)
 		return 1;
 	mddev->private = conf;
-	mddev->array_sectors = conf->array_sectors;
+	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
 
 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
 	mddev->queue->unplug_fn = linear_unplug;
@@ -283,7 +297,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
 	newconf->prev = mddev_to_conf(mddev);
 	mddev->private = newconf;
 	mddev->raid_disks++;
-	mddev->array_sectors = newconf->array_sectors;
+	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	return 0;
 }
@@ -381,6 +395,7 @@ static struct mdk_personality linear_personality =
 	.stop		= linear_stop,
 	.status		= linear_status,
 	.hot_add_disk	= linear_add,
+	.size		= linear_size,
 };
 
 static int __init linear_init (void)
diff --git a/drivers/md/linear.h b/drivers/md/linear.h
new file mode 100644
index 00000000000..bf8179587f9
--- /dev/null
+++ b/drivers/md/linear.h
@@ -0,0 +1,29 @@
+#ifndef _LINEAR_H
+#define _LINEAR_H
+
+struct dev_info {
+	mdk_rdev_t	*rdev;
+	sector_t	num_sectors;
+	sector_t	start_sector;
+};
+
+typedef struct dev_info dev_info_t;
+
+struct linear_private_data
+{
+	struct linear_private_data *prev;	/* earlier version */
+	dev_info_t		**hash_table;
+	sector_t		spacing;
+	sector_t		array_sectors;
+	int			sector_shift;	/* shift before dividing
+						 * by spacing
+						 */
+	dev_info_t		disks[0];
+};
+
+
+typedef struct linear_private_data linear_conf_t;
+
+#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private)
+
+#endif
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a307f87eb90..ed5727c089a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -33,9 +33,9 @@
 */
 
 #include <linux/kthread.h>
-#include <linux/raid/md.h>
-#include <linux/raid/bitmap.h>
+#include <linux/blkdev.h>
 #include <linux/sysctl.h>
+#include <linux/seq_file.h>
 #include <linux/buffer_head.h> /* for invalidate_bdev */
 #include <linux/poll.h>
 #include <linux/ctype.h>
@@ -45,11 +45,10 @@
 #include <linux/reboot.h>
 #include <linux/file.h>
 #include <linux/delay.h>
-
-#define MAJOR_NR MD_MAJOR
-
-/* 63 partitions with the alternate major number (mdp) */
-#define MdpMinorShift 6
+#include <linux/raid/md_p.h>
+#include <linux/raid/md_u.h>
+#include "md.h"
+#include "bitmap.h"
 
 #define DEBUG 0
 #define dprintk(x...) ((void)(DEBUG && printk(x)))
@@ -202,12 +201,68 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
 		)
 
 
-static int md_fail_request(struct request_queue *q, struct bio *bio)
+/* Rather than calling directly into the personality make_request function,
+ * IO requests come here first so that we can check if the device is
+ * being suspended pending a reconfiguration.
+ * We hold a refcount over the call to ->make_request.  By the time that
+ * call has finished, the bio has been linked into some internal structure
+ * and so is visible to ->quiesce(), so we don't need the refcount any more.
+ */
+static int md_make_request(struct request_queue *q, struct bio *bio)
 {
-	bio_io_error(bio);
-	return 0;
+	mddev_t *mddev = q->queuedata;
+	int rv;
+	if (mddev == NULL || mddev->pers == NULL) {
+		bio_io_error(bio);
+		return 0;
+	}
+	rcu_read_lock();
+	if (mddev->suspended) {
+		DEFINE_WAIT(__wait);
+		for (;;) {
+			prepare_to_wait(&mddev->sb_wait, &__wait,
+					TASK_UNINTERRUPTIBLE);
+			if (!mddev->suspended)
+				break;
+			rcu_read_unlock();
+			schedule();
+			rcu_read_lock();
+		}
+		finish_wait(&mddev->sb_wait, &__wait);
+	}
+	atomic_inc(&mddev->active_io);
+	rcu_read_unlock();
+	rv = mddev->pers->make_request(q, bio);
+	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
+		wake_up(&mddev->sb_wait);
+
+	return rv;
+}
+
+static void mddev_suspend(mddev_t *mddev)
+{
+	BUG_ON(mddev->suspended);
+	mddev->suspended = 1;
+	synchronize_rcu();
+	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
+	mddev->pers->quiesce(mddev, 1);
+	md_unregister_thread(mddev->thread);
+	mddev->thread = NULL;
+	/* we now know that no code is executing in the personality module,
+	 * except possibly the tail end of a ->bi_end_io function, but that
+	 * is certain to complete before the module has a chance to get
+	 * unloaded
+	 */
+}
+
+static void mddev_resume(mddev_t *mddev)
+{
+	mddev->suspended = 0;
+	wake_up(&mddev->sb_wait);
+	mddev->pers->quiesce(mddev, 0);
 }
 
+
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
 	atomic_inc(&mddev->active);
@@ -310,6 +365,7 @@ static mddev_t * mddev_find(dev_t unit)
 	init_timer(&new->safemode_timer);
 	atomic_set(&new->active, 1);
 	atomic_set(&new->openers, 0);
+	atomic_set(&new->active_io, 0);
 	spin_lock_init(&new->write_lock);
 	init_waitqueue_head(&new->sb_wait);
 	init_waitqueue_head(&new->recovery_wait);
@@ -326,6 +382,11 @@ static inline int mddev_lock(mddev_t * mddev)
 	return mutex_lock_interruptible(&mddev->reconfig_mutex);
 }
 
+static inline int mddev_is_locked(mddev_t *mddev)
+{
+	return mutex_is_locked(&mddev->reconfig_mutex);
+}
+
 static inline int mddev_trylock(mddev_t * mddev)
 {
 	return mutex_trylock(&mddev->reconfig_mutex);
@@ -409,7 +470,7 @@ static void free_disk_sb(mdk_rdev_t * rdev)
 		rdev->sb_loaded = 0;
 		rdev->sb_page = NULL;
 		rdev->sb_start = 0;
-		rdev->size = 0;
+		rdev->sectors = 0;
 	}
 }
 
@@ -775,9 +836,9 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 		else 
 			ret = 0;
 	}
-	rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2;
+	rdev->sectors = calc_num_sectors(rdev, sb->chunk_size);
 
-	if (rdev->size < sb->size && sb->level > 1)
+	if (rdev->sectors < sb->size * 2 && sb->level > 1)
 		/* "this cannot possibly happen" ... */
 		ret = -EINVAL;
 
@@ -812,7 +873,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->clevel[0] = 0;
 		mddev->layout = sb->layout;
 		mddev->raid_disks = sb->raid_disks;
-		mddev->size = sb->size;
+		mddev->dev_sectors = sb->size * 2;
 		mddev->events = ev1;
 		mddev->bitmap_offset = 0;
 		mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
@@ -926,7 +987,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	sb->ctime = mddev->ctime;
 	sb->level = mddev->level;
-	sb->size  = mddev->size;
+	sb->size = mddev->dev_sectors / 2;
 	sb->raid_disks = mddev->raid_disks;
 	sb->md_minor = mddev->md_minor;
 	sb->not_persistent = 0;
@@ -1024,7 +1085,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 static unsigned long long
 super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 {
-	if (num_sectors && num_sectors < rdev->mddev->size * 2)
+	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
 		return 0; /* component must fit device */
 	if (rdev->mddev->bitmap_offset)
 		return 0; /* can't move bitmap */
@@ -1180,16 +1241,17 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 			ret = 0;
 	}
 	if (minor_version)
-		rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
+		rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+			le64_to_cpu(sb->data_offset);
 	else
-		rdev->size = rdev->sb_start / 2;
-	if (rdev->size < le64_to_cpu(sb->data_size)/2)
+		rdev->sectors = rdev->sb_start;
+	if (rdev->sectors < le64_to_cpu(sb->data_size))
 		return -EINVAL;
-	rdev->size = le64_to_cpu(sb->data_size)/2;
+	rdev->sectors = le64_to_cpu(sb->data_size);
 	if (le32_to_cpu(sb->chunksize))
-		rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
+		rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1);
 
-	if (le64_to_cpu(sb->size) > rdev->size*2)
+	if (le64_to_cpu(sb->size) > rdev->sectors)
 		return -EINVAL;
 	return ret;
 }
@@ -1216,7 +1278,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->clevel[0] = 0;
 		mddev->layout = le32_to_cpu(sb->layout);
 		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
-		mddev->size = le64_to_cpu(sb->size)/2;
+		mddev->dev_sectors = le64_to_cpu(sb->size);
 		mddev->events = ev1;
 		mddev->bitmap_offset = 0;
 		mddev->default_bitmap_offset = 1024 >> 9;
@@ -1312,7 +1374,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
 
 	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
-	sb->size = cpu_to_le64(mddev->size<<1);
+	sb->size = cpu_to_le64(mddev->dev_sectors);
 
 	if (mddev->bitmap && mddev->bitmap_file == NULL) {
 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
@@ -1320,10 +1382,15 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	}
 
 	if (rdev->raid_disk >= 0 &&
-	    !test_bit(In_sync, &rdev->flags) &&
-	    rdev->recovery_offset > 0) {
-		sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
-		sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
+	    !test_bit(In_sync, &rdev->flags)) {
+		if (mddev->curr_resync_completed > rdev->recovery_offset)
+			rdev->recovery_offset = mddev->curr_resync_completed;
+		if (rdev->recovery_offset > 0) {
+			sb->feature_map |=
+				cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
+			sb->recovery_offset =
+				cpu_to_le64(rdev->recovery_offset);
+		}
 	}
 
 	if (mddev->reshape_position != MaxSector) {
@@ -1365,7 +1432,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 {
 	struct mdp_superblock_1 *sb;
 	sector_t max_sectors;
-	if (num_sectors && num_sectors < rdev->mddev->size * 2)
+	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
 		return 0; /* component must fit device */
 	if (rdev->sb_start < rdev->data_offset) {
 		/* minor versions 1 and 2; superblock before data */
@@ -1381,7 +1448,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 		sector_t sb_start;
 		sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
 		sb_start &= ~(sector_t)(4*2 - 1);
-		max_sectors = rdev->size * 2 + sb_start - rdev->sb_start;
+		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
 		if (!num_sectors || num_sectors > max_sectors)
 			num_sectors = max_sectors;
 		rdev->sb_start = sb_start;
@@ -1433,6 +1500,38 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
 
 static LIST_HEAD(pending_raid_disks);
 
+static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev)
+{
+	struct mdk_personality *pers = mddev->pers;
+	struct gendisk *disk = mddev->gendisk;
+	struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
+	struct blk_integrity *bi_mddev = blk_get_integrity(disk);
+
+	/* Data integrity passthrough not supported on RAID 4, 5 and 6 */
+	if (pers && pers->level >= 4 && pers->level <= 6)
+		return;
+
+	/* If rdev is integrity capable, register profile for mddev */
+	if (!bi_mddev && bi_rdev) {
+		if (blk_integrity_register(disk, bi_rdev))
+			printk(KERN_ERR "%s: %s Could not register integrity!\n",
+			       __func__, disk->disk_name);
+		else
+			printk(KERN_NOTICE "Enabling data integrity on %s\n",
+			       disk->disk_name);
+		return;
+	}
+
+	/* Check that mddev and rdev have matching profiles */
+	if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) {
+		printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__,
+		       disk->disk_name, rdev->bdev->bd_disk->disk_name);
+		printk(KERN_NOTICE "Disabling data integrity on %s\n",
+		       disk->disk_name);
+		blk_integrity_unregister(disk);
+	}
+}
+
 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 {
 	char b[BDEVNAME_SIZE];
@@ -1449,8 +1548,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 	if (find_rdev(mddev, rdev->bdev->bd_dev))
 		return -EEXIST;
 
-	/* make sure rdev->size exceeds mddev->size */
-	if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
+	/* make sure rdev->sectors exceeds mddev->dev_sectors */
+	if (rdev->sectors && (mddev->dev_sectors == 0 ||
+			rdev->sectors < mddev->dev_sectors)) {
 		if (mddev->pers) {
 			/* Cannot change size, so fail
 			 * If mddev->level <= 0, then we don't care
@@ -1459,7 +1559,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 			if (mddev->level > 0)
 				return -ENOSPC;
 		} else
-			mddev->size = rdev->size;
+			mddev->dev_sectors = rdev->sectors;
 	}
 
 	/* Verify rdev->desc_nr is unique.
@@ -1503,6 +1603,8 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 
 	/* May as well allow recovery to be retried once */
 	mddev->recovery_disabled = 0;
+
+	md_integrity_check(rdev, mddev);
 	return 0;
 
  fail:
@@ -1713,8 +1815,8 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
 static void print_rdev(mdk_rdev_t *rdev, int major_version)
 {
 	char b[BDEVNAME_SIZE];
-	printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
-		bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
+	printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
+		bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors,
 	        test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
 	        rdev->desc_nr);
 	if (rdev->sb_loaded) {
@@ -2153,7 +2255,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		return -EINVAL;
 	if (rdev->mddev->pers && rdev->raid_disk >= 0)
 		return -EBUSY;
-	if (rdev->size && rdev->mddev->external)
+	if (rdev->sectors && rdev->mddev->external)
 		/* Must set offset before size, so overlap checks
 		 * can be sane */
 		return -EBUSY;
@@ -2167,7 +2269,7 @@ __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
 static ssize_t
 rdev_size_show(mdk_rdev_t *rdev, char *page)
 {
-	return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
+	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
 }
 
 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
@@ -2180,34 +2282,52 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
 	return 1;
 }
 
+static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
+{
+	unsigned long long blocks;
+	sector_t new;
+
+	if (strict_strtoull(buf, 10, &blocks) < 0)
+		return -EINVAL;
+
+	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
+		return -EINVAL; /* sector conversion overflow */
+
+	new = blocks * 2;
+	if (new != blocks * 2)
+		return -EINVAL; /* unsigned long long to sector_t overflow */
+
+	*sectors = new;
+	return 0;
+}
+
 static ssize_t
 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 {
-	unsigned long long size;
-	unsigned long long oldsize = rdev->size;
 	mddev_t *my_mddev = rdev->mddev;
+	sector_t oldsectors = rdev->sectors;
+	sector_t sectors;
 
-	if (strict_strtoull(buf, 10, &size) < 0)
+	if (strict_blocks_to_sectors(buf, &sectors) < 0)
 		return -EINVAL;
 	if (my_mddev->pers && rdev->raid_disk >= 0) {
 		if (my_mddev->persistent) {
-			size = super_types[my_mddev->major_version].
-				rdev_size_change(rdev, size * 2);
-			if (!size)
+			sectors = super_types[my_mddev->major_version].
+				rdev_size_change(rdev, sectors);
+			if (!sectors)
 				return -EBUSY;
-		} else if (!size) {
-			size = (rdev->bdev->bd_inode->i_size >> 10);
-			size -= rdev->data_offset/2;
-		}
+		} else if (!sectors)
+			sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+				rdev->data_offset;
 	}
-	if (size < my_mddev->size)
+	if (sectors < my_mddev->dev_sectors)
 		return -EINVAL; /* component must fit device */
 
-	rdev->size = size;
-	if (size > oldsize && my_mddev->external) {
+	rdev->sectors = sectors;
+	if (sectors > oldsectors && my_mddev->external) {
 		/* need to check that all other rdevs with the same ->bdev
 		 * do not overlap.  We need to unlock the mddev to avoid
-		 * a deadlock.  We have already changed rdev->size, and if
+		 * a deadlock.  We have already changed rdev->sectors, and if
 		 * we have to change it back, we will have the lock again.
 		 */
 		mddev_t *mddev;
@@ -2223,9 +2343,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 				if (test_bit(AllReserved, &rdev2->flags) ||
 				    (rdev->bdev == rdev2->bdev &&
 				     rdev != rdev2 &&
-				     overlaps(rdev->data_offset, rdev->size * 2,
+				     overlaps(rdev->data_offset, rdev->sectors,
 					      rdev2->data_offset,
-					      rdev2->size * 2))) {
+					      rdev2->sectors))) {
 					overlap = 1;
 					break;
 				}
@@ -2239,11 +2359,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		if (overlap) {
 			/* Someone else could have slipped in a size
 			 * change here, but doing so is just silly.
-			 * We put oldsize back because we *know* it is
+			 * We put oldsectors back because we *know* it is
 			 * safe, and trust userspace not to race with
 			 * itself
 			 */
-			rdev->size = oldsize;
+			rdev->sectors = oldsectors;
 			return -EBUSY;
 		}
 	}
@@ -2547,18 +2667,101 @@ level_show(mddev_t *mddev, char *page)
 static ssize_t
 level_store(mddev_t *mddev, const char *buf, size_t len)
 {
+	char level[16];
 	ssize_t rv = len;
-	if (mddev->pers)
+	struct mdk_personality *pers;
+	void *priv;
+
+	if (mddev->pers == NULL) {
+		if (len == 0)
+			return 0;
+		if (len >= sizeof(mddev->clevel))
+			return -ENOSPC;
+		strncpy(mddev->clevel, buf, len);
+		if (mddev->clevel[len-1] == '\n')
+			len--;
+		mddev->clevel[len] = 0;
+		mddev->level = LEVEL_NONE;
+		return rv;
+	}
+
+	/* request to change the personality.  Need to ensure:
+	 *  - array is not engaged in resync/recovery/reshape
+	 *  - old personality can be suspended
+	 *  - new personality will access other array.
+	 */
+
+	if (mddev->sync_thread || mddev->reshape_position != MaxSector)
 		return -EBUSY;
-	if (len == 0)
-		return 0;
-	if (len >= sizeof(mddev->clevel))
-		return -ENOSPC;
-	strncpy(mddev->clevel, buf, len);
-	if (mddev->clevel[len-1] == '\n')
+
+	if (!mddev->pers->quiesce) {
+		printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
+		       mdname(mddev), mddev->pers->name);
+		return -EINVAL;
+	}
+
+	/* Now find the new personality */
+	if (len == 0 || len >= sizeof(level))
+		return -EINVAL;
+	strncpy(level, buf, len);
+	if (level[len-1] == '\n')
 		len--;
-	mddev->clevel[len] = 0;
-	mddev->level = LEVEL_NONE;
+	level[len] = 0;
+
+	request_module("md-%s", level);
+	spin_lock(&pers_lock);
+	pers = find_pers(LEVEL_NONE, level);
+	if (!pers || !try_module_get(pers->owner)) {
+		spin_unlock(&pers_lock);
+		printk(KERN_WARNING "md: personality %s not loaded\n", level);
+		return -EINVAL;
+	}
+	spin_unlock(&pers_lock);
+
+	if (pers == mddev->pers) {
+		/* Nothing to do! */
+		module_put(pers->owner);
+		return rv;
+	}
+	if (!pers->takeover) {
+		module_put(pers->owner);
+		printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
+		       mdname(mddev), level);
+		return -EINVAL;
+	}
+
+	/* ->takeover must set new_* and/or delta_disks
+	 * if it succeeds, and may set them when it fails.
+	 */
+	priv = pers->takeover(mddev);
+	if (IS_ERR(priv)) {
+		mddev->new_level = mddev->level;
+		mddev->new_layout = mddev->layout;
+		mddev->new_chunk = mddev->chunk_size;
+		mddev->raid_disks -= mddev->delta_disks;
+		mddev->delta_disks = 0;
+		module_put(pers->owner);
+		printk(KERN_WARNING "md: %s: %s would not accept array\n",
+		       mdname(mddev), level);
+		return PTR_ERR(priv);
+	}
+
+	/* Looks like we have a winner */
+	mddev_suspend(mddev);
+	mddev->pers->stop(mddev);
+	module_put(mddev->pers->owner);
+	mddev->pers = pers;
+	mddev->private = priv;
+	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+	mddev->level = mddev->new_level;
+	mddev->layout = mddev->new_layout;
+	mddev->chunk_size = mddev->new_chunk;
+	mddev->delta_disks = 0;
+	pers->run(mddev);
+	mddev_resume(mddev);
+	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	md_wakeup_thread(mddev->thread);
 	return rv;
 }
 
@@ -2586,12 +2789,18 @@ layout_store(mddev_t *mddev, const char *buf, size_t len)
 	if (!*buf || (*e && *e != '\n'))
 		return -EINVAL;
 
-	if (mddev->pers)
-		return -EBUSY;
-	if (mddev->reshape_position != MaxSector)
+	if (mddev->pers) {
+		int err;
+		if (mddev->pers->reconfig == NULL)
+			return -EBUSY;
+		err = mddev->pers->reconfig(mddev, n, -1);
+		if (err)
+			return err;
+	} else {
 		mddev->new_layout = n;
-	else
-		mddev->layout = n;
+		if (mddev->reshape_position == MaxSector)
+			mddev->layout = n;
+	}
 	return len;
 }
 static struct md_sysfs_entry md_layout =
@@ -2648,19 +2857,24 @@ chunk_size_show(mddev_t *mddev, char *page)
 static ssize_t
 chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
 {
-	/* can only set chunk_size if array is not yet active */
 	char *e;
 	unsigned long n = simple_strtoul(buf, &e, 10);
 
 	if (!*buf || (*e && *e != '\n'))
 		return -EINVAL;
 
-	if (mddev->pers)
-		return -EBUSY;
-	else if (mddev->reshape_position != MaxSector)
+	if (mddev->pers) {
+		int err;
+		if (mddev->pers->reconfig == NULL)
+			return -EBUSY;
+		err = mddev->pers->reconfig(mddev, -1, n);
+		if (err)
+			return err;
+	} else {
 		mddev->new_chunk = n;
-	else
-		mddev->chunk_size = n;
+		if (mddev->reshape_position == MaxSector)
+			mddev->chunk_size = n;
+	}
 	return len;
 }
 static struct md_sysfs_entry md_chunk_size =
@@ -2669,6 +2883,8 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
 static ssize_t
 resync_start_show(mddev_t *mddev, char *page)
 {
+	if (mddev->recovery_cp == MaxSector)
+		return sprintf(page, "none\n");
 	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
 }
 
@@ -2766,7 +2982,7 @@ array_state_show(mddev_t *mddev, char *page)
 	else {
 		if (list_empty(&mddev->disks) &&
 		    mddev->raid_disks == 0 &&
-		    mddev->size == 0)
+		    mddev->dev_sectors == 0)
 			st = clear;
 		else
 			st = inactive;
@@ -2973,7 +3189,8 @@ __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
 static ssize_t
 size_show(mddev_t *mddev, char *page)
 {
-	return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
+	return sprintf(page, "%llu\n",
+		(unsigned long long)mddev->dev_sectors / 2);
 }
 
 static int update_size(mddev_t *mddev, sector_t num_sectors);
@@ -2985,20 +3202,18 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
 	 * not increase it (except from 0).
 	 * If array is active, we can try an on-line resize
 	 */
-	char *e;
-	int err = 0;
-	unsigned long long size = simple_strtoull(buf, &e, 10);
-	if (!*buf || *buf == '\n' ||
-	    (*e && *e != '\n'))
-		return -EINVAL;
+	sector_t sectors;
+	int err = strict_blocks_to_sectors(buf, &sectors);
 
+	if (err < 0)
+		return err;
 	if (mddev->pers) {
-		err = update_size(mddev, size * 2);
+		err = update_size(mddev, sectors);
 		md_update_sb(mddev, 1);
 	} else {
-		if (mddev->size == 0 ||
-		    mddev->size > size)
-			mddev->size = size;
+		if (mddev->dev_sectors == 0 ||
+		    mddev->dev_sectors > sectors)
+			mddev->dev_sectors = sectors;
 		else
 			err = -ENOSPC;
 	}
@@ -3251,6 +3466,8 @@ static ssize_t
 sync_speed_show(mddev_t *mddev, char *page)
 {
 	unsigned long resync, dt, db;
+	if (mddev->curr_resync == 0)
+		return sprintf(page, "none\n");
 	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
 	dt = (jiffies - mddev->resync_mark) / HZ;
 	if (!dt) dt++;
@@ -3263,15 +3480,15 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
 static ssize_t
 sync_completed_show(mddev_t *mddev, char *page)
 {
-	unsigned long max_blocks, resync;
+	unsigned long max_sectors, resync;
 
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
-		max_blocks = mddev->resync_max_sectors;
+		max_sectors = mddev->resync_max_sectors;
 	else
-		max_blocks = mddev->size << 1;
+		max_sectors = mddev->dev_sectors;
 
 	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
-	return sprintf(page, "%lu / %lu\n", resync, max_blocks);
+	return sprintf(page, "%lu / %lu\n", resync, max_sectors);
 }
 
 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
@@ -3431,6 +3648,57 @@ static struct md_sysfs_entry md_reshape_position =
 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
        reshape_position_store);
 
+static ssize_t
+array_size_show(mddev_t *mddev, char *page)
+{
+	if (mddev->external_size)
+		return sprintf(page, "%llu\n",
+			       (unsigned long long)mddev->array_sectors/2);
+	else
+		return sprintf(page, "default\n");
+}
+
+static ssize_t
+array_size_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	sector_t sectors;
+
+	if (strncmp(buf, "default", 7) == 0) {
+		if (mddev->pers)
+			sectors = mddev->pers->size(mddev, 0, 0);
+		else
+			sectors = mddev->array_sectors;
+
+		mddev->external_size = 0;
+	} else {
+		if (strict_blocks_to_sectors(buf, &sectors) < 0)
+			return -EINVAL;
+		if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
+			return -EINVAL;
+
+		mddev->external_size = 1;
+	}
+
+	mddev->array_sectors = sectors;
+	set_capacity(mddev->gendisk, mddev->array_sectors);
+	if (mddev->pers) {
+		struct block_device *bdev = bdget_disk(mddev->gendisk, 0);
+
+		if (bdev) {
+			mutex_lock(&bdev->bd_inode->i_mutex);
+			i_size_write(bdev->bd_inode,
+				     (loff_t)mddev->array_sectors << 9);
+			mutex_unlock(&bdev->bd_inode->i_mutex);
+			bdput(bdev);
+		}
+	}
+
+	return len;
+}
+
+static struct md_sysfs_entry md_array_size =
+__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
+       array_size_store);
 
 static struct attribute *md_default_attrs[] = {
 	&md_level.attr,
@@ -3444,6 +3712,7 @@ static struct attribute *md_default_attrs[] = {
 	&md_safe_delay.attr,
 	&md_array_state.attr,
 	&md_reshape_position.attr,
+	&md_array_size.attr,
 	NULL,
 };
 
@@ -3602,10 +3871,12 @@ static int md_alloc(dev_t dev, char *name)
 		mddev_put(mddev);
 		return -ENOMEM;
 	}
+	mddev->queue->queuedata = mddev;
+
 	/* Can be unlocked because the queue is new: no concurrency */
 	queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
 
-	blk_queue_make_request(mddev->queue, md_fail_request);
+	blk_queue_make_request(mddev->queue, md_make_request);
 
 	disk = alloc_disk(1 << shift);
 	if (!disk) {
@@ -3731,13 +4002,13 @@ static int do_md_run(mddev_t * mddev)
 		list_for_each_entry(rdev, &mddev->disks, same_set) {
 			if (test_bit(Faulty, &rdev->flags))
 				continue;
-			if (rdev->size < chunk_size / 1024) {
+			if (rdev->sectors < chunk_size / 512) {
 				printk(KERN_WARNING
 					"md: Dev %s smaller than chunk_size:"
-					" %lluk < %dk\n",
+					" %llu < %d\n",
 					bdevname(rdev->bdev,b),
-					(unsigned long long)rdev->size,
-					chunk_size / 1024);
+					(unsigned long long)rdev->sectors,
+					chunk_size / 512);
 				return -EINVAL;
 			}
 		}
@@ -3761,11 +4032,11 @@ static int do_md_run(mddev_t * mddev)
 
 		/* perform some consistency tests on the device.
 		 * We don't want the data to overlap the metadata,
-		 * Internal Bitmap issues has handled elsewhere.
+		 * Internal Bitmap issues have been handled elsewhere.
 		 */
 		if (rdev->data_offset < rdev->sb_start) {
-			if (mddev->size &&
-			    rdev->data_offset + mddev->size*2
+			if (mddev->dev_sectors &&
+			    rdev->data_offset + mddev->dev_sectors
 			    > rdev->sb_start) {
 				printk("md: %s: data overlaps metadata\n",
 				       mdname(mddev));
@@ -3801,9 +4072,16 @@ static int do_md_run(mddev_t * mddev)
 	}
 	mddev->pers = pers;
 	spin_unlock(&pers_lock);
-	mddev->level = pers->level;
+	if (mddev->level != pers->level) {
+		mddev->level = pers->level;
+		mddev->new_level = pers->level;
+	}
 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
 
+	if (pers->level >= 4 && pers->level <= 6)
+		/* Cannot support integrity (yet) */
+		blk_integrity_unregister(mddev->gendisk);
+
 	if (mddev->reshape_position != MaxSector &&
 	    pers->start_reshape == NULL) {
 		/* This personality cannot handle reshaping... */
@@ -3843,7 +4121,9 @@ static int do_md_run(mddev_t * mddev)
 	}
 
 	mddev->recovery = 0;
-	mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
+	/* may be over-ridden by personality */
+	mddev->resync_max_sectors = mddev->dev_sectors;
+
 	mddev->barriers_work = 1;
 	mddev->ok_start_degraded = start_dirty_degraded;
 
@@ -3853,7 +4133,17 @@ static int do_md_run(mddev_t * mddev)
 	err = mddev->pers->run(mddev);
 	if (err)
 		printk(KERN_ERR "md: pers->run() failed ...\n");
-	else if (mddev->pers->sync_request) {
+	else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
+		WARN_ONCE(!mddev->external_size, "%s: default size too small,"
+			  " but 'external_size' not in effect?\n", __func__);
+		printk(KERN_ERR
+		       "md: invalid array_size %llu > default size %llu\n",
+		       (unsigned long long)mddev->array_sectors / 2,
+		       (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
+		err = -EINVAL;
+		mddev->pers->stop(mddev);
+	}
+	if (err == 0 && mddev->pers->sync_request) {
 		err = bitmap_create(mddev);
 		if (err) {
 			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
@@ -3899,16 +4189,6 @@ static int do_md_run(mddev_t * mddev)
 
 	set_capacity(disk, mddev->array_sectors);
 
-	/* If we call blk_queue_make_request here, it will
-	 * re-initialise max_sectors etc which may have been
-	 * refined inside -> run.  So just set the bits we need to set.
-	 * Most initialisation happended when we called
-	 * blk_queue_make_request(..., md_fail_request)
-	 * earlier.
-	 */
-	mddev->queue->queuedata = mddev;
-	mddev->queue->make_request_fn = mddev->pers->make_request;
-
 	/* If there is a partially-recovered drive we need to
 	 * start recovery here.  If we leave it to md_check_recovery,
 	 * it will remove the drives and not do the right thing
@@ -4038,7 +4318,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 			md_super_wait(mddev);
 			if (mddev->ro)
 				set_disk_ro(disk, 0);
-			blk_queue_make_request(mddev->queue, md_fail_request);
+
 			mddev->pers->stop(mddev);
 			mddev->queue->merge_bvec_fn = NULL;
 			mddev->queue->unplug_fn = NULL;
@@ -4095,7 +4375,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		export_array(mddev);
 
 		mddev->array_sectors = 0;
-		mddev->size = 0;
+		mddev->external_size = 0;
+		mddev->dev_sectors = 0;
 		mddev->raid_disks = 0;
 		mddev->recovery_cp = 0;
 		mddev->resync_min = 0;
@@ -4135,6 +4416,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		printk(KERN_INFO "md: %s switched to read-only mode.\n",
 			mdname(mddev));
 	err = 0;
+	blk_integrity_unregister(disk);
 	md_new_event(mddev);
 	sysfs_notify_dirent(mddev->sysfs_state);
 out:
@@ -4300,8 +4582,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
 	info.patch_version = MD_PATCHLEVEL_VERSION;
 	info.ctime         = mddev->ctime;
 	info.level         = mddev->level;
-	info.size          = mddev->size;
-	if (info.size != mddev->size) /* overflow */
+	info.size          = mddev->dev_sectors / 2;
+	if (info.size != mddev->dev_sectors / 2) /* overflow */
 		info.size = -1;
 	info.nr_disks      = nr;
 	info.raid_disks    = mddev->raid_disks;
@@ -4480,6 +4762,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		clear_bit(In_sync, &rdev->flags); /* just to be sure */
 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
 			set_bit(WriteMostly, &rdev->flags);
+		else
+			clear_bit(WriteMostly, &rdev->flags);
 
 		rdev->raid_disk = -1;
 		err = bind_rdev_to_array(rdev, mddev);
@@ -4543,7 +4827,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 			rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
 		} else 
 			rdev->sb_start = calc_dev_sboffset(rdev->bdev);
-		rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
+		rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
 
 		err = bind_rdev_to_array(rdev, mddev);
 		if (err) {
@@ -4613,7 +4897,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 	else
 		rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
 
-	rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
+	rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
 
 	if (test_bit(Faulty, &rdev->flags)) {
 		printk(KERN_WARNING 
@@ -4749,7 +5033,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 
 	mddev->level         = info->level;
 	mddev->clevel[0]     = 0;
-	mddev->size          = info->size;
+	mddev->dev_sectors   = 2 * (sector_t)info->size;
 	mddev->raid_disks    = info->raid_disks;
 	/* don't set md_minor, it is determined by which /dev/md* was
 	 * openned
@@ -4788,6 +5072,17 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 	return 0;
 }
 
+void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors)
+{
+	WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
+
+	if (mddev->external_size)
+		return;
+
+	mddev->array_sectors = array_sectors;
+}
+EXPORT_SYMBOL(md_set_array_sectors);
+
 static int update_size(mddev_t *mddev, sector_t num_sectors)
 {
 	mdk_rdev_t *rdev;
@@ -4814,8 +5109,7 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
 		 */
 		return -EBUSY;
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
-		sector_t avail;
-		avail = rdev->size * 2;
+		sector_t avail = rdev->sectors;
 
 		if (fit && (num_sectors == 0 || num_sectors > avail))
 			num_sectors = avail;
@@ -4887,12 +5181,18 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 		)
 		return -EINVAL;
 	/* Check there is only one change */
-	if (info->size >= 0 && mddev->size != info->size) cnt++;
-	if (mddev->raid_disks != info->raid_disks) cnt++;
-	if (mddev->layout != info->layout) cnt++;
-	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
-	if (cnt == 0) return 0;
-	if (cnt > 1) return -EINVAL;
+	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
+		cnt++;
+	if (mddev->raid_disks != info->raid_disks)
+		cnt++;
+	if (mddev->layout != info->layout)
+		cnt++;
+	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
+		cnt++;
+	if (cnt == 0)
+		return 0;
+	if (cnt > 1)
+		return -EINVAL;
 
 	if (mddev->layout != info->layout) {
 		/* Change layout
@@ -4904,7 +5204,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 		else
 			return mddev->pers->reconfig(mddev, info->layout, -1);
 	}
-	if (info->size >= 0 && mddev->size != info->size)
+	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
 		rv = update_size(mddev, (sector_t)info->size * 2);
 
 	if (mddev->raid_disks    != info->raid_disks)
@@ -5331,6 +5631,8 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
 
 void md_unregister_thread(mdk_thread_t *thread)
 {
+	if (!thread)
+		return;
 	dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
 
 	kthread_stop(thread->tsk);
@@ -5404,7 +5706,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
 		max_blocks = mddev->resync_max_sectors >> 1;
 	else
-		max_blocks = mddev->size;
+		max_blocks = mddev->dev_sectors / 2;
 
 	/*
 	 * Should not happen.
@@ -5537,7 +5839,7 @@ struct mdstat_info {
 static int md_seq_show(struct seq_file *seq, void *v)
 {
 	mddev_t *mddev = v;
-	sector_t size;
+	sector_t sectors;
 	mdk_rdev_t *rdev;
 	struct mdstat_info *mi = seq->private;
 	struct bitmap *bitmap;
@@ -5573,7 +5875,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
 			seq_printf(seq, " %s", mddev->pers->name);
 		}
 
-		size = 0;
+		sectors = 0;
 		list_for_each_entry(rdev, &mddev->disks, same_set) {
 			char b[BDEVNAME_SIZE];
 			seq_printf(seq, " %s[%d]",
@@ -5585,7 +5887,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
 				continue;
 			} else if (rdev->raid_disk < 0)
 				seq_printf(seq, "(S)"); /* spare */
-			size += rdev->size;
+			sectors += rdev->sectors;
 		}
 
 		if (!list_empty(&mddev->disks)) {
@@ -5595,7 +5897,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
 					   mddev->array_sectors / 2);
 			else
 				seq_printf(seq, "\n      %llu blocks",
-					   (unsigned long long)size);
+					   (unsigned long long)sectors / 2);
 		}
 		if (mddev->persistent) {
 			if (mddev->major_version != 0 ||
@@ -5722,19 +6024,19 @@ int unregister_md_personality(struct mdk_personality *p)
 	return 0;
 }
 
-static int is_mddev_idle(mddev_t *mddev)
+static int is_mddev_idle(mddev_t *mddev, int init)
 {
 	mdk_rdev_t * rdev;
 	int idle;
-	long curr_events;
+	int curr_events;
 
 	idle = 1;
 	rcu_read_lock();
 	rdev_for_each_rcu(rdev, mddev) {
 		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
-		curr_events = part_stat_read(&disk->part0, sectors[0]) +
-				part_stat_read(&disk->part0, sectors[1]) -
-				atomic_read(&disk->sync_io);
+		curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
+			      (int)part_stat_read(&disk->part0, sectors[1]) -
+			      atomic_read(&disk->sync_io);
 		/* sync IO will cause sync_io to increase before the disk_stats
 		 * as sync_io is counted when a request starts, and
 		 * disk_stats is counted when it completes.
@@ -5757,7 +6059,7 @@ static int is_mddev_idle(mddev_t *mddev)
 		 * always make curr_events less than last_events.
 		 *
 		 */
-		if (curr_events - rdev->last_events > 4096) {
+		if (init || curr_events - rdev->last_events > 64) {
 			rdev->last_events = curr_events;
 			idle = 0;
 		}
@@ -5980,10 +6282,10 @@ void md_do_sync(mddev_t *mddev)
 			j = mddev->recovery_cp;
 
 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
-		max_sectors = mddev->size << 1;
+		max_sectors = mddev->dev_sectors;
 	else {
 		/* recovery follows the physical size of devices */
-		max_sectors = mddev->size << 1;
+		max_sectors = mddev->dev_sectors;
 		j = MaxSector;
 		list_for_each_entry(rdev, &mddev->disks, same_set)
 			if (rdev->raid_disk >= 0 &&
@@ -6000,7 +6302,7 @@ void md_do_sync(mddev_t *mddev)
 	       "(but not more than %d KB/sec) for %s.\n",
 	       speed_max(mddev), desc);
 
-	is_mddev_idle(mddev); /* this also initializes IO event counters */
+	is_mddev_idle(mddev, 1); /* this initializes IO event counters */
 
 	io_sectors = 0;
 	for (m = 0; m < SYNC_MARKS; m++) {
@@ -6040,6 +6342,18 @@ void md_do_sync(mddev_t *mddev)
 		}
 		if (kthread_should_stop())
 			goto interrupted;
+
+		if (mddev->curr_resync > mddev->curr_resync_completed &&
+		    (mddev->curr_resync - mddev->curr_resync_completed)
+		    > (max_sectors >> 4)) {
+			/* time to update curr_resync_completed */
+			blk_unplug(mddev->queue);
+			wait_event(mddev->recovery_wait,
+				   atomic_read(&mddev->recovery_active) == 0);
+			mddev->curr_resync_completed =
+				mddev->curr_resync;
+			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+		}
 		sectors = mddev->pers->sync_request(mddev, j, &skipped,
 						  currspeed < speed_min(mddev));
 		if (sectors == 0) {
@@ -6102,7 +6416,7 @@ void md_do_sync(mddev_t *mddev)
 
 		if (currspeed > speed_min(mddev)) {
 			if ((currspeed > speed_max(mddev)) ||
-					!is_mddev_idle(mddev)) {
+					!is_mddev_idle(mddev, 0)) {
 				msleep(500);
 				goto repeat;
 			}
@@ -6173,6 +6487,8 @@ static int remove_and_add_spares(mddev_t *mddev)
 	mdk_rdev_t *rdev;
 	int spares = 0;
 
+	mddev->curr_resync_completed = 0;
+
 	list_for_each_entry(rdev, &mddev->disks, same_set)
 		if (rdev->raid_disk >= 0 &&
 		    !test_bit(Blocked, &rdev->flags) &&
@@ -6327,6 +6643,9 @@ void md_check_recovery(mddev_t *mddev)
 					sysfs_notify(&mddev->kobj, NULL,
 						     "degraded");
 			}
+			if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+			    mddev->pers->finish_reshape)
+				mddev->pers->finish_reshape(mddev);
 			md_update_sb(mddev, 1);
 
 			/* if array is no-longer degraded, then any saved_raid_disk
@@ -6470,13 +6789,13 @@ static void md_geninit(void)
 
 static int __init md_init(void)
 {
-	if (register_blkdev(MAJOR_NR, "md"))
+	if (register_blkdev(MD_MAJOR, "md"))
 		return -1;
 	if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
-		unregister_blkdev(MAJOR_NR, "md");
+		unregister_blkdev(MD_MAJOR, "md");
 		return -1;
 	}
-	blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE,
+	blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
 			    md_probe, NULL, NULL);
 	blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
 			    md_probe, NULL, NULL);
@@ -6562,10 +6881,10 @@ static __exit void md_exit(void)
 	mddev_t *mddev;
 	struct list_head *tmp;
 
-	blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS);
+	blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
 	blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
 
-	unregister_blkdev(MAJOR_NR,"md");
+	unregister_blkdev(MD_MAJOR,"md");
 	unregister_blkdev(mdp_major, "mdp");
 	unregister_reboot_notifier(&md_notifier);
 	unregister_sysctl_table(raid_table_header);
diff --git a/drivers/md/md.h b/drivers/md/md.h
new file mode 100644
index 00000000000..e9b7f54c24d
--- /dev/null
+++ b/drivers/md/md.h
@@ -0,0 +1,436 @@
+/*
+   md_k.h : kernel internal structure of the Linux MD driver
+          Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
+	  
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+   
+   You should have received a copy of the GNU General Public License
+   (for example /usr/src/linux/COPYING); if not, write to the Free
+   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
+*/
+
+#ifndef _MD_K_H
+#define _MD_K_H
+
+#ifdef CONFIG_BLOCK
+
+#define MaxSector (~(sector_t)0)
+
+typedef struct mddev_s mddev_t;
+typedef struct mdk_rdev_s mdk_rdev_t;
+
+/*
+ * options passed in raidrun:
+ */
+
+/* Currently this must fit in an 'int' */
+#define MAX_CHUNK_SIZE (1<<30)
+
+/*
+ * MD's 'extended' device
+ */
+struct mdk_rdev_s
+{
+	struct list_head same_set;	/* RAID devices within the same set */
+
+	sector_t sectors;		/* Device size (in 512bytes sectors) */
+	mddev_t *mddev;			/* RAID array if running */
+	int last_events;		/* IO event timestamp */
+
+	struct block_device *bdev;	/* block device handle */
+
+	struct page	*sb_page;
+	int		sb_loaded;
+	__u64		sb_events;
+	sector_t	data_offset;	/* start of data in array */
+	sector_t 	sb_start;	/* offset of the super block (in 512byte sectors) */
+	int		sb_size;	/* bytes in the superblock */
+	int		preferred_minor;	/* autorun support */
+
+	struct kobject	kobj;
+
+	/* A device can be in one of three states based on two flags:
+	 * Not working:   faulty==1 in_sync==0
+	 * Fully working: faulty==0 in_sync==1
+	 * Working, but not
+	 * in sync with array
+	 *                faulty==0 in_sync==0
+	 *
+	 * It can never have faulty==1, in_sync==1
+	 * This reduces the burden of testing multiple flags in many cases
+	 */
+
+	unsigned long	flags;
+#define	Faulty		1		/* device is known to have a fault */
+#define	In_sync		2		/* device is in_sync with rest of array */
+#define	WriteMostly	4		/* Avoid reading if at all possible */
+#define	BarriersNotsupp	5		/* BIO_RW_BARRIER is not supported */
+#define	AllReserved	6		/* If whole device is reserved for
+					 * one array */
+#define	AutoDetected	7		/* added by auto-detect */
+#define Blocked		8		/* An error occured on an externally
+					 * managed array, don't allow writes
+					 * until it is cleared */
+#define StateChanged	9		/* Faulty or Blocked has changed during
+					 * interrupt, so it needs to be
+					 * notified by the thread */
+	wait_queue_head_t blocked_wait;
+
+	int desc_nr;			/* descriptor index in the superblock */
+	int raid_disk;			/* role of device in array */
+	int saved_raid_disk;		/* role that device used to have in the
+					 * array and could again if we did a partial
+					 * resync from the bitmap
+					 */
+	sector_t	recovery_offset;/* If this device has been partially
+					 * recovered, this is where we were
+					 * up to.
+					 */
+
+	atomic_t	nr_pending;	/* number of pending requests.
+					 * only maintained for arrays that
+					 * support hot removal
+					 */
+	atomic_t	read_errors;	/* number of consecutive read errors that
+					 * we have tried to ignore.
+					 */
+	atomic_t	corrected_errors; /* number of corrected read errors,
+					   * for reporting to userspace and storing
+					   * in superblock.
+					   */
+	struct work_struct del_work;	/* used for delayed sysfs removal */
+
+	struct sysfs_dirent *sysfs_state; /* handle for 'state'
+					   * sysfs entry */
+};
+
+struct mddev_s
+{
+	void				*private;
+	struct mdk_personality		*pers;
+	dev_t				unit;
+	int				md_minor;
+	struct list_head 		disks;
+	unsigned long			flags;
+#define MD_CHANGE_DEVS	0	/* Some device status has changed */
+#define MD_CHANGE_CLEAN 1	/* transition to or from 'clean' */
+#define MD_CHANGE_PENDING 2	/* superblock update in progress */
+
+	int				suspended;
+	atomic_t			active_io;
+	int				ro;
+
+	struct gendisk			*gendisk;
+
+	struct kobject			kobj;
+	int				hold_active;
+#define	UNTIL_IOCTL	1
+#define	UNTIL_STOP	2
+
+	/* Superblock information */
+	int				major_version,
+					minor_version,
+					patch_version;
+	int				persistent;
+	int 				external;	/* metadata is
+							 * managed externally */
+	char				metadata_type[17]; /* externally set*/
+	int				chunk_size;
+	time_t				ctime, utime;
+	int				level, layout;
+	char				clevel[16];
+	int				raid_disks;
+	int				max_disks;
+	sector_t			dev_sectors; 	/* used size of
+							 * component devices */
+	sector_t			array_sectors; /* exported array size */
+	int				external_size; /* size managed
+							* externally */
+	__u64				events;
+
+	char				uuid[16];
+
+	/* If the array is being reshaped, we need to record the
+	 * new shape and an indication of where we are up to.
+	 * This is written to the superblock.
+	 * If reshape_position is MaxSector, then no reshape is happening (yet).
+	 */
+	sector_t			reshape_position;
+	int				delta_disks, new_level, new_layout, new_chunk;
+
+	struct mdk_thread_s		*thread;	/* management thread */
+	struct mdk_thread_s		*sync_thread;	/* doing resync or reconstruct */
+	sector_t			curr_resync;	/* last block scheduled */
+	/* As resync requests can complete out of order, we cannot easily track
+	 * how much resync has been completed.  So we occasionally pause until
+	 * everything completes, then set curr_resync_completed to curr_resync.
+	 * As such it may be well behind the real resync mark, but it is a value
+	 * we are certain of.
+	 */
+	sector_t			curr_resync_completed;
+	unsigned long			resync_mark;	/* a recent timestamp */
+	sector_t			resync_mark_cnt;/* blocks written at resync_mark */
+	sector_t			curr_mark_cnt; /* blocks scheduled now */
+
+	sector_t			resync_max_sectors; /* may be set by personality */
+
+	sector_t			resync_mismatches; /* count of sectors where
+							    * parity/replica mismatch found
+							    */
+
+	/* allow user-space to request suspension of IO to regions of the array */
+	sector_t			suspend_lo;
+	sector_t			suspend_hi;
+	/* if zero, use the system-wide default */
+	int				sync_speed_min;
+	int				sync_speed_max;
+
+	/* resync even though the same disks are shared among md-devices */
+	int				parallel_resync;
+
+	int				ok_start_degraded;
+	/* recovery/resync flags 
+	 * NEEDED:   we might need to start a resync/recover
+	 * RUNNING:  a thread is running, or about to be started
+	 * SYNC:     actually doing a resync, not a recovery
+	 * RECOVER:  doing recovery, or need to try it.
+	 * INTR:     resync needs to be aborted for some reason
+	 * DONE:     thread is done and is waiting to be reaped
+	 * REQUEST:  user-space has requested a sync (used with SYNC)
+	 * CHECK:    user-space request for for check-only, no repair
+	 * RESHAPE:  A reshape is happening
+	 *
+	 * If neither SYNC or RESHAPE are set, then it is a recovery.
+	 */
+#define	MD_RECOVERY_RUNNING	0
+#define	MD_RECOVERY_SYNC	1
+#define	MD_RECOVERY_RECOVER	2
+#define	MD_RECOVERY_INTR	3
+#define	MD_RECOVERY_DONE	4
+#define	MD_RECOVERY_NEEDED	5
+#define	MD_RECOVERY_REQUESTED	6
+#define	MD_RECOVERY_CHECK	7
+#define MD_RECOVERY_RESHAPE	8
+#define	MD_RECOVERY_FROZEN	9
+
+	unsigned long			recovery;
+	int				recovery_disabled; /* if we detect that recovery
+							    * will always fail, set this
+							    * so we don't loop trying */
+
+	int				in_sync;	/* know to not need resync */
+	struct mutex			reconfig_mutex;
+	atomic_t			active;		/* general refcount */
+	atomic_t			openers;	/* number of active opens */
+
+	int				changed;	/* true if we might need to reread partition info */
+	int				degraded;	/* whether md should consider
+							 * adding a spare
+							 */
+	int				barriers_work;	/* initialised to true, cleared as soon
+							 * as a barrier request to slave
+							 * fails.  Only supported
+							 */
+	struct bio			*biolist; 	/* bios that need to be retried
+							 * because BIO_RW_BARRIER is not supported
+							 */
+
+	atomic_t			recovery_active; /* blocks scheduled, but not written */
+	wait_queue_head_t		recovery_wait;
+	sector_t			recovery_cp;
+	sector_t			resync_min;	/* user requested sync
+							 * starts here */
+	sector_t			resync_max;	/* resync should pause
+							 * when it gets here */
+
+	struct sysfs_dirent		*sysfs_state;	/* handle for 'array_state'
+							 * file in sysfs.
+							 */
+	struct sysfs_dirent		*sysfs_action;  /* handle for 'sync_action' */
+
+	struct work_struct del_work;	/* used for delayed sysfs removal */
+
+	spinlock_t			write_lock;
+	wait_queue_head_t		sb_wait;	/* for waiting on superblock updates */
+	atomic_t			pending_writes;	/* number of active superblock writes */
+
+	unsigned int			safemode;	/* if set, update "clean" superblock
+							 * when no writes pending.
+							 */ 
+	unsigned int			safemode_delay;
+	struct timer_list		safemode_timer;
+	atomic_t			writes_pending; 
+	struct request_queue		*queue;	/* for plugging ... */
+
+	atomic_t                        write_behind; /* outstanding async IO */
+	unsigned int                    max_write_behind; /* 0 = sync */
+
+	struct bitmap                   *bitmap; /* the bitmap for the device */
+	struct file			*bitmap_file; /* the bitmap file */
+	long				bitmap_offset; /* offset from superblock of
+							* start of bitmap. May be
+							* negative, but not '0'
+							*/
+	long				default_bitmap_offset; /* this is the offset to use when
+								* hot-adding a bitmap.  It should
+								* eventually be settable by sysfs.
+								*/
+
+	struct list_head		all_mddevs;
+};
+
+
+static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev)
+{
+	int faulty = test_bit(Faulty, &rdev->flags);
+	if (atomic_dec_and_test(&rdev->nr_pending) && faulty)
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+}
+
+static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
+{
+        atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
+}
+
+struct mdk_personality
+{
+	char *name;
+	int level;
+	struct list_head list;
+	struct module *owner;
+	int (*make_request)(struct request_queue *q, struct bio *bio);
+	int (*run)(mddev_t *mddev);
+	int (*stop)(mddev_t *mddev);
+	void (*status)(struct seq_file *seq, mddev_t *mddev);
+	/* error_handler must set ->faulty and clear ->in_sync
+	 * if appropriate, and should abort recovery if needed 
+	 */
+	void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev);
+	int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev);
+	int (*hot_remove_disk) (mddev_t *mddev, int number);
+	int (*spare_active) (mddev_t *mddev);
+	sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster);
+	int (*resize) (mddev_t *mddev, sector_t sectors);
+	sector_t (*size) (mddev_t *mddev, sector_t sectors, int raid_disks);
+	int (*check_reshape) (mddev_t *mddev);
+	int (*start_reshape) (mddev_t *mddev);
+	void (*finish_reshape) (mddev_t *mddev);
+	int (*reconfig) (mddev_t *mddev, int layout, int chunk_size);
+	/* quiesce moves between quiescence states
+	 * 0 - fully active
+	 * 1 - no new requests allowed
+	 * others - reserved
+	 */
+	void (*quiesce) (mddev_t *mddev, int state);
+	/* takeover is used to transition an array from one
+	 * personality to another.  The new personality must be able
+	 * to handle the data in the current layout.
+	 * e.g. 2drive raid1 -> 2drive raid5
+	 *      ndrive raid5 -> degraded n+1drive raid6 with special layout
+	 * If the takeover succeeds, a new 'private' structure is returned.
+	 * This needs to be installed and then ->run used to activate the
+	 * array.
+	 */
+	void *(*takeover) (mddev_t *mddev);
+};
+
+
+struct md_sysfs_entry {
+	struct attribute attr;
+	ssize_t (*show)(mddev_t *, char *);
+	ssize_t (*store)(mddev_t *, const char *, size_t);
+};
+
+
+static inline char * mdname (mddev_t * mddev)
+{
+	return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
+}
+
+/*
+ * iterates through some rdev ringlist. It's safe to remove the
+ * current 'rdev'. Dont touch 'tmp' though.
+ */
+#define rdev_for_each_list(rdev, tmp, head)				\
+	list_for_each_entry_safe(rdev, tmp, head, same_set)
+
+/*
+ * iterates through the 'same array disks' ringlist
+ */
+#define rdev_for_each(rdev, tmp, mddev)				\
+	list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
+
+#define rdev_for_each_rcu(rdev, mddev)				\
+	list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
+
+typedef struct mdk_thread_s {
+	void			(*run) (mddev_t *mddev);
+	mddev_t			*mddev;
+	wait_queue_head_t	wqueue;
+	unsigned long           flags;
+	struct task_struct	*tsk;
+	unsigned long		timeout;
+} mdk_thread_t;
+
+#define THREAD_WAKEUP  0
+
+#define __wait_event_lock_irq(wq, condition, lock, cmd) 		\
+do {									\
+	wait_queue_t __wait;						\
+	init_waitqueue_entry(&__wait, current);				\
+									\
+	add_wait_queue(&wq, &__wait);					\
+	for (;;) {							\
+		set_current_state(TASK_UNINTERRUPTIBLE);		\
+		if (condition)						\
+			break;						\
+		spin_unlock_irq(&lock);					\
+		cmd;							\
+		schedule();						\
+		spin_lock_irq(&lock);					\
+	}								\
+	current->state = TASK_RUNNING;					\
+	remove_wait_queue(&wq, &__wait);				\
+} while (0)
+
+#define wait_event_lock_irq(wq, condition, lock, cmd) 			\
+do {									\
+	if (condition)	 						\
+		break;							\
+	__wait_event_lock_irq(wq, condition, lock, cmd);		\
+} while (0)
+
+static inline void safe_put_page(struct page *p)
+{
+	if (p) put_page(p);
+}
+
+#endif /* CONFIG_BLOCK */
+#endif
+
+
+extern int register_md_personality(struct mdk_personality *p);
+extern int unregister_md_personality(struct mdk_personality *p);
+extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
+				mddev_t *mddev, const char *name);
+extern void md_unregister_thread(mdk_thread_t *thread);
+extern void md_wakeup_thread(mdk_thread_t *thread);
+extern void md_check_recovery(mddev_t *mddev);
+extern void md_write_start(mddev_t *mddev, struct bio *bi);
+extern void md_write_end(mddev_t *mddev);
+extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
+extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
+
+extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
+			   sector_t sector, int size, struct page *page);
+extern void md_super_wait(mddev_t *mddev);
+extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
+			struct page *page, int rw);
+extern void md_do_sync(mddev_t *mddev);
+extern void md_new_event(mddev_t *mddev);
+extern int md_allow_write(mddev_t *mddev);
+extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
+extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
diff --git a/drivers/md/mktables.c b/drivers/md/mktables.c
index b61d5767aae..3b1500843bb 100644
--- a/drivers/md/mktables.c
+++ b/drivers/md/mktables.c
@@ -59,7 +59,7 @@ int main(int argc, char *argv[])
 	uint8_t v;
 	uint8_t exptbl[256], invtbl[256];
 
-	printf("#include \"raid6.h\"\n");
+	printf("#include <linux/raid/pq.h>\n");
 
 	/* Compute multiplication table */
 	printf("\nconst u8  __attribute__((aligned(256)))\n"
@@ -76,6 +76,9 @@ int main(int argc, char *argv[])
 		printf("\t},\n");
 	}
 	printf("};\n");
+	printf("#ifdef __KERNEL__\n");
+	printf("EXPORT_SYMBOL(raid6_gfmul);\n");
+	printf("#endif\n");
 
 	/* Compute power-of-2 table (exponent) */
 	v = 1;
@@ -92,6 +95,9 @@ int main(int argc, char *argv[])
 		}
 	}
 	printf("};\n");
+	printf("#ifdef __KERNEL__\n");
+	printf("EXPORT_SYMBOL(raid6_gfexp);\n");
+	printf("#endif\n");
 
 	/* Compute inverse table x^-1 == x^254 */
 	printf("\nconst u8 __attribute__((aligned(256)))\n"
@@ -104,6 +110,9 @@ int main(int argc, char *argv[])
 		}
 	}
 	printf("};\n");
+	printf("#ifdef __KERNEL__\n");
+	printf("EXPORT_SYMBOL(raid6_gfinv);\n");
+	printf("#endif\n");
 
 	/* Compute inv(2^x + 1) (exponent-xor-inverse) table */
 	printf("\nconst u8 __attribute__((aligned(256)))\n"
@@ -115,6 +124,9 @@ int main(int argc, char *argv[])
 			       (j == 7) ? '\n' : ' ');
 	}
 	printf("};\n");
+	printf("#ifdef __KERNEL__\n");
+	printf("EXPORT_SYMBOL(raid6_gfexi);\n");
+	printf("#endif\n");
 
 	return 0;
 }
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index f6d08f24167..41ced0cbe82 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -19,7 +19,11 @@
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#include <linux/raid/multipath.h>
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "multipath.h"
 
 #define MAX_WORK_PER_DISK 128
 
@@ -402,6 +406,14 @@ static void multipathd (mddev_t *mddev)
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 }
 
+static sector_t multipath_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	WARN_ONCE(sectors || raid_disks,
+		  "%s does not support generic reshape\n", __func__);
+
+	return mddev->dev_sectors;
+}
+
 static int multipath_run (mddev_t *mddev)
 {
 	multipath_conf_t *conf;
@@ -498,7 +510,7 @@ static int multipath_run (mddev_t *mddev)
 	/*
 	 * Ok, everything is just fine now
 	 */
-	mddev->array_sectors = mddev->size * 2;
+	md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
 
 	mddev->queue->unplug_fn = multipath_unplug;
 	mddev->queue->backing_dev_info.congested_fn = multipath_congested;
@@ -543,6 +555,7 @@ static struct mdk_personality multipath_personality =
 	.error_handler	= multipath_error,
 	.hot_add_disk	= multipath_add_disk,
 	.hot_remove_disk= multipath_remove_disk,
+	.size		= multipath_size,
 };
 
 static int __init multipath_init (void)
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h
new file mode 100644
index 00000000000..6fa70b400cd
--- /dev/null
+++ b/drivers/md/multipath.h
@@ -0,0 +1,40 @@
+#ifndef _MULTIPATH_H
+#define _MULTIPATH_H
+
+struct multipath_info {
+	mdk_rdev_t	*rdev;
+};
+
+struct multipath_private_data {
+	mddev_t			*mddev;
+	struct multipath_info	*multipaths;
+	int			raid_disks;
+	int			working_disks;
+	spinlock_t		device_lock;
+	struct list_head	retry_list;
+
+	mempool_t		*pool;
+};
+
+typedef struct multipath_private_data multipath_conf_t;
+
+/*
+ * this is the only point in the RAID code where we violate
+ * C type safety. mddev->private is an 'opaque' pointer.
+ */
+#define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private)
+
+/*
+ * this is our 'private' 'collective' MULTIPATH buffer head.
+ * it contains information about what kind of IO operations were started
+ * for this MULTIPATH operation, and about their status:
+ */
+
+struct multipath_bh {
+	mddev_t			*mddev;
+	struct bio		*master_bio;
+	struct bio		bio;
+	int			path;
+	struct list_head	retry_list;
+};
+#endif
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index c605ba80558..c08d7559be5 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -18,7 +18,10 @@
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
 */
 
-#include <linux/raid/raid0.h>
+#include <linux/blkdev.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "raid0.h"
 
 static void raid0_unplug(struct request_queue *q)
 {
@@ -73,16 +76,15 @@ static int create_strip_zones (mddev_t *mddev)
 		list_for_each_entry(rdev2, &mddev->disks, same_set) {
 			printk(KERN_INFO "raid0:   comparing %s(%llu)",
 			       bdevname(rdev1->bdev,b),
-			       (unsigned long long)rdev1->size);
+			       (unsigned long long)rdev1->sectors);
 			printk(KERN_INFO " with %s(%llu)\n",
 			       bdevname(rdev2->bdev,b),
-			       (unsigned long long)rdev2->size);
+			       (unsigned long long)rdev2->sectors);
 			if (rdev2 == rdev1) {
 				printk(KERN_INFO "raid0:   END\n");
 				break;
 			}
-			if (rdev2->size == rdev1->size)
-			{
+			if (rdev2->sectors == rdev1->sectors) {
 				/*
 				 * Not unique, don't count it as a new
 				 * group
@@ -145,7 +147,7 @@ static int create_strip_zones (mddev_t *mddev)
 		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
-		if (!smallest || (rdev1->size <smallest->size))
+		if (!smallest || (rdev1->sectors < smallest->sectors))
 			smallest = rdev1;
 		cnt++;
 	}
@@ -155,10 +157,10 @@ static int create_strip_zones (mddev_t *mddev)
 		goto abort;
 	}
 	zone->nb_dev = cnt;
-	zone->sectors = smallest->size * cnt * 2;
+	zone->sectors = smallest->sectors * cnt;
 	zone->zone_start = 0;
 
-	current_start = smallest->size * 2;
+	current_start = smallest->sectors;
 	curr_zone_start = zone->sectors;
 
 	/* now do the other zones */
@@ -177,29 +179,29 @@ static int create_strip_zones (mddev_t *mddev)
 			rdev = conf->strip_zone[0].dev[j];
 			printk(KERN_INFO "raid0: checking %s ...",
 				bdevname(rdev->bdev, b));
-			if (rdev->size > current_start / 2) {
-				printk(KERN_INFO " contained as device %d\n",
-					c);
-				zone->dev[c] = rdev;
-				c++;
-				if (!smallest || (rdev->size <smallest->size)) {
-					smallest = rdev;
-					printk(KERN_INFO "  (%llu) is smallest!.\n",
-						(unsigned long long)rdev->size);
-				}
-			} else
+			if (rdev->sectors <= current_start) {
 				printk(KERN_INFO " nope.\n");
+				continue;
+			}
+			printk(KERN_INFO " contained as device %d\n", c);
+			zone->dev[c] = rdev;
+			c++;
+			if (!smallest || rdev->sectors < smallest->sectors) {
+				smallest = rdev;
+				printk(KERN_INFO "  (%llu) is smallest!.\n",
+					(unsigned long long)rdev->sectors);
+			}
 		}
 
 		zone->nb_dev = c;
-		zone->sectors = (smallest->size * 2 - current_start) * c;
+		zone->sectors = (smallest->sectors - current_start) * c;
 		printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n",
 			zone->nb_dev, (unsigned long long)zone->sectors);
 
 		zone->zone_start = curr_zone_start;
 		curr_zone_start += zone->sectors;
 
-		current_start = smallest->size * 2;
+		current_start = smallest->sectors;
 		printk(KERN_INFO "raid0: current zone start: %llu\n",
 			(unsigned long long)current_start);
 	}
@@ -261,12 +263,25 @@ static int raid0_mergeable_bvec(struct request_queue *q,
 		return max;
 }
 
+static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	sector_t array_sectors = 0;
+	mdk_rdev_t *rdev;
+
+	WARN_ONCE(sectors || raid_disks,
+		  "%s does not support generic reshape\n", __func__);
+
+	list_for_each_entry(rdev, &mddev->disks, same_set)
+		array_sectors += rdev->sectors;
+
+	return array_sectors;
+}
+
 static int raid0_run (mddev_t *mddev)
 {
 	unsigned  cur=0, i=0, nb_zone;
 	s64 sectors;
 	raid0_conf_t *conf;
-	mdk_rdev_t *rdev;
 
 	if (mddev->chunk_size == 0) {
 		printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
@@ -291,16 +306,14 @@ static int raid0_run (mddev_t *mddev)
 		goto out_free_conf;
 
 	/* calculate array device size */
-	mddev->array_sectors = 0;
-	list_for_each_entry(rdev, &mddev->disks, same_set)
-		mddev->array_sectors += rdev->size * 2;
+	md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
 
 	printk(KERN_INFO "raid0 : md_size is %llu sectors.\n",
 		(unsigned long long)mddev->array_sectors);
 	printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n",
 		(unsigned long long)conf->spacing);
 	{
-		sector_t s = mddev->array_sectors;
+		sector_t s = raid0_size(mddev, 0, 0);
 		sector_t space = conf->spacing;
 		int round;
 		conf->sector_shift = 0;
@@ -509,6 +522,7 @@ static struct mdk_personality raid0_personality=
 	.run		= raid0_run,
 	.stop		= raid0_stop,
 	.status		= raid0_status,
+	.size		= raid0_size,
 };
 
 static int __init raid0_init (void)
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
new file mode 100644
index 00000000000..824b12eb1d4
--- /dev/null
+++ b/drivers/md/raid0.h
@@ -0,0 +1,28 @@
+#ifndef _RAID0_H
+#define _RAID0_H
+
+struct strip_zone
+{
+	sector_t zone_start;	/* Zone offset in md_dev (in sectors) */
+	sector_t dev_start;	/* Zone offset in real dev (in sectors) */
+	sector_t sectors;	/* Zone size in sectors */
+	int nb_dev;		/* # of devices attached to the zone */
+	mdk_rdev_t **dev;	/* Devices attached to the zone */
+};
+
+struct raid0_private_data
+{
+	struct strip_zone **hash_table; /* Table of indexes into strip_zone */
+	struct strip_zone *strip_zone;
+	mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
+	int nr_strip_zones;
+
+	sector_t spacing;
+	int sector_shift; /* shift this before divide by spacing */
+};
+
+typedef struct raid0_private_data raid0_conf_t;
+
+#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private)
+
+#endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e2466425d9c..b4f4badc006 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -31,10 +31,13 @@
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#include "dm-bio-list.h"
 #include <linux/delay.h>
-#include <linux/raid/raid1.h>
-#include <linux/raid/bitmap.h>
+#include <linux/blkdev.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "dm-bio-list.h"
+#include "raid1.h"
+#include "bitmap.h"
 
 #define DEBUG 0
 #if DEBUG
@@ -1723,7 +1726,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 			return 0;
 	}
 
-	max_sector = mddev->size << 1;
+	max_sector = mddev->dev_sectors;
 	if (sector_nr >= max_sector) {
 		/* If we aborted, we need to abort the
 		 * sync on the 'current' bitmap chunk (there will
@@ -1919,6 +1922,14 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	return nr_sectors;
 }
 
+static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	if (sectors)
+		return sectors;
+
+	return mddev->dev_sectors;
+}
+
 static int run(mddev_t *mddev)
 {
 	conf_t *conf;
@@ -2048,7 +2059,7 @@ static int run(mddev_t *mddev)
 	/*
 	 * Ok, everything is just fine now
 	 */
-	mddev->array_sectors = mddev->size * 2;
+	md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
 
 	mddev->queue->unplug_fn = raid1_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid1_congested;
@@ -2089,6 +2100,9 @@ static int stop(mddev_t *mddev)
 		/* need to kick something here to make sure I/O goes? */
 	}
 
+	raise_barrier(conf);
+	lower_barrier(conf);
+
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
@@ -2110,15 +2124,17 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
 	 * any io in the removed space completes, but it hardly seems
 	 * worth it.
 	 */
-	mddev->array_sectors = sectors;
+	md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0));
+	if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
+		return -EINVAL;
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	mddev->changed = 1;
-	if (mddev->array_sectors / 2 > mddev->size &&
+	if (sectors > mddev->dev_sectors &&
 	    mddev->recovery_cp == MaxSector) {
-		mddev->recovery_cp = mddev->size << 1;
+		mddev->recovery_cp = mddev->dev_sectors;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}
-	mddev->size = mddev->array_sectors / 2;
+	mddev->dev_sectors = sectors;
 	mddev->resync_max_sectors = sectors;
 	return 0;
 }
@@ -2264,6 +2280,7 @@ static struct mdk_personality raid1_personality =
 	.spare_active	= raid1_spare_active,
 	.sync_request	= sync_request,
 	.resize		= raid1_resize,
+	.size		= raid1_size,
 	.check_reshape	= raid1_reshape,
 	.quiesce	= raid1_quiesce,
 };
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
new file mode 100644
index 00000000000..1620eea3d57
--- /dev/null
+++ b/drivers/md/raid1.h
@@ -0,0 +1,132 @@
+#ifndef _RAID1_H
+#define _RAID1_H
+
+typedef struct mirror_info mirror_info_t;
+
+struct mirror_info {
+	mdk_rdev_t	*rdev;
+	sector_t	head_position;
+};
+
+/*
+ * memory pools need a pointer to the mddev, so they can force an unplug
+ * when memory is tight, and a count of the number of drives that the
+ * pool was allocated for, so they know how much to allocate and free.
+ * mddev->raid_disks cannot be used, as it can change while a pool is active
+ * These two datums are stored in a kmalloced struct.
+ */
+
+struct pool_info {
+	mddev_t *mddev;
+	int	raid_disks;
+};
+
+
+typedef struct r1bio_s r1bio_t;
+
+struct r1_private_data_s {
+	mddev_t			*mddev;
+	mirror_info_t		*mirrors;
+	int			raid_disks;
+	int			last_used;
+	sector_t		next_seq_sect;
+	spinlock_t		device_lock;
+
+	struct list_head	retry_list;
+	/* queue pending writes and submit them on unplug */
+	struct bio_list		pending_bio_list;
+	/* queue of writes that have been unplugged */
+	struct bio_list		flushing_bio_list;
+
+	/* for use when syncing mirrors: */
+
+	spinlock_t		resync_lock;
+	int			nr_pending;
+	int			nr_waiting;
+	int			nr_queued;
+	int			barrier;
+	sector_t		next_resync;
+	int			fullsync;  /* set to 1 if a full sync is needed,
+					    * (fresh device added).
+					    * Cleared when a sync completes.
+					    */
+
+	wait_queue_head_t	wait_barrier;
+
+	struct pool_info	*poolinfo;
+
+	struct page		*tmppage;
+
+	mempool_t *r1bio_pool;
+	mempool_t *r1buf_pool;
+};
+
+typedef struct r1_private_data_s conf_t;
+
+/*
+ * this is the only point in the RAID code where we violate
+ * C type safety. mddev->private is an 'opaque' pointer.
+ */
+#define mddev_to_conf(mddev) ((conf_t *) mddev->private)
+
+/*
+ * this is our 'private' RAID1 bio.
+ *
+ * it contains information about what kind of IO operations were started
+ * for this RAID1 operation, and about their status:
+ */
+
+struct r1bio_s {
+	atomic_t		remaining; /* 'have we finished' count,
+					    * used from IRQ handlers
+					    */
+	atomic_t		behind_remaining; /* number of write-behind ios remaining
+						 * in this BehindIO request
+						 */
+	sector_t		sector;
+	int			sectors;
+	unsigned long		state;
+	mddev_t			*mddev;
+	/*
+	 * original bio going to /dev/mdx
+	 */
+	struct bio		*master_bio;
+	/*
+	 * if the IO is in READ direction, then this is where we read
+	 */
+	int			read_disk;
+
+	struct list_head	retry_list;
+	struct bitmap_update	*bitmap_update;
+	/*
+	 * if the IO is in WRITE direction, then multiple bios are used.
+	 * We choose the number when they are allocated.
+	 */
+	struct bio		*bios[0];
+	/* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/
+};
+
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error.  To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio*)1)
+
+/* bits for r1bio.state */
+#define	R1BIO_Uptodate	0
+#define	R1BIO_IsSync	1
+#define	R1BIO_Degraded	2
+#define	R1BIO_BehindIO	3
+#define	R1BIO_Barrier	4
+#define R1BIO_BarrierRetry 5
+/* For write-behind requests, we call bi_end_io when
+ * the last non-write-behind device completes, providing
+ * any write was successful.  Otherwise we call when
+ * any write-behind write succeeds, otherwise we call
+ * with failure when last write completes (and all failed).
+ * Record that bi_end_io was called with this flag...
+ */
+#define	R1BIO_Returned 6
+
+#endif
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7301631abe0..e293d92641a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -18,10 +18,13 @@
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#include "dm-bio-list.h"
 #include <linux/delay.h>
-#include <linux/raid/raid10.h>
-#include <linux/raid/bitmap.h>
+#include <linux/blkdev.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "dm-bio-list.h"
+#include "raid10.h"
+#include "bitmap.h"
 
 /*
  * RAID10 provides a combination of RAID0 and RAID1 functionality.
@@ -1695,7 +1698,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 			return 0;
 
  skipped:
-	max_sector = mddev->size << 1;
+	max_sector = mddev->dev_sectors;
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
 		max_sector = mddev->resync_max_sectors;
 	if (sector_nr >= max_sector) {
@@ -2020,6 +2023,25 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	goto skipped;
 }
 
+static sector_t
+raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	sector_t size;
+	conf_t *conf = mddev_to_conf(mddev);
+
+	if (!raid_disks)
+		raid_disks = mddev->raid_disks;
+	if (!sectors)
+		sectors = mddev->dev_sectors;
+
+	size = sectors >> conf->chunk_shift;
+	sector_div(size, conf->far_copies);
+	size = size * raid_disks;
+	sector_div(size, conf->near_copies);
+
+	return size << conf->chunk_shift;
+}
+
 static int run(mddev_t *mddev)
 {
 	conf_t *conf;
@@ -2076,7 +2098,7 @@ static int run(mddev_t *mddev)
 	conf->far_offset = fo;
 	conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1;
 	conf->chunk_shift = ffz(~mddev->chunk_size) - 9;
-	size = mddev->size >> (conf->chunk_shift-1);
+	size = mddev->dev_sectors >> conf->chunk_shift;
 	sector_div(size, fc);
 	size = size * conf->raid_disks;
 	sector_div(size, nc);
@@ -2089,7 +2111,7 @@ static int run(mddev_t *mddev)
 	 */
 	stride += conf->raid_disks - 1;
 	sector_div(stride, conf->raid_disks);
-	mddev->size = stride  << (conf->chunk_shift-1);
+	mddev->dev_sectors = stride << conf->chunk_shift;
 
 	if (fo)
 		stride = 1;
@@ -2171,8 +2193,8 @@ static int run(mddev_t *mddev)
 	/*
 	 * Ok, everything is just fine now
 	 */
-	mddev->array_sectors = size << conf->chunk_shift;
-	mddev->resync_max_sectors = size << conf->chunk_shift;
+	md_set_array_sectors(mddev, raid10_size(mddev, 0, 0));
+	mddev->resync_max_sectors = raid10_size(mddev, 0, 0);
 
 	mddev->queue->unplug_fn = raid10_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid10_congested;
@@ -2208,6 +2230,9 @@ static int stop(mddev_t *mddev)
 {
 	conf_t *conf = mddev_to_conf(mddev);
 
+	raise_barrier(conf, 0);
+	lower_barrier(conf);
+
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
@@ -2255,6 +2280,7 @@ static struct mdk_personality raid10_personality =
 	.spare_active	= raid10_spare_active,
 	.sync_request	= sync_request,
 	.quiesce	= raid10_quiesce,
+	.size		= raid10_size,
 };
 
 static int __init raid_init(void)
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
new file mode 100644
index 00000000000..244dbe507a5
--- /dev/null
+++ b/drivers/md/raid10.h
@@ -0,0 +1,121 @@
+#ifndef _RAID10_H
+#define _RAID10_H
+
+typedef struct mirror_info mirror_info_t;
+
+struct mirror_info {
+	mdk_rdev_t	*rdev;
+	sector_t	head_position;
+};
+
+typedef struct r10bio_s r10bio_t;
+
+struct r10_private_data_s {
+	mddev_t			*mddev;
+	mirror_info_t		*mirrors;
+	int			raid_disks;
+	spinlock_t		device_lock;
+
+	/* geometry */
+	int			near_copies;  /* number of copies layed out raid0 style */
+	int 			far_copies;   /* number of copies layed out
+					       * at large strides across drives
+					       */
+	int			far_offset;   /* far_copies are offset by 1 stripe
+					       * instead of many
+					       */
+	int			copies;	      /* near_copies * far_copies.
+					       * must be <= raid_disks
+					       */
+	sector_t		stride;	      /* distance between far copies.
+					       * This is size / far_copies unless
+					       * far_offset, in which case it is
+					       * 1 stripe.
+					       */
+
+	int chunk_shift; /* shift from chunks to sectors */
+	sector_t chunk_mask;
+
+	struct list_head	retry_list;
+	/* queue pending writes and submit them on unplug */
+	struct bio_list		pending_bio_list;
+
+
+	spinlock_t		resync_lock;
+	int nr_pending;
+	int nr_waiting;
+	int nr_queued;
+	int barrier;
+	sector_t		next_resync;
+	int			fullsync;  /* set to 1 if a full sync is needed,
+					    * (fresh device added).
+					    * Cleared when a sync completes.
+					    */
+
+	wait_queue_head_t	wait_barrier;
+
+	mempool_t *r10bio_pool;
+	mempool_t *r10buf_pool;
+	struct page		*tmppage;
+};
+
+typedef struct r10_private_data_s conf_t;
+
+/*
+ * this is the only point in the RAID code where we violate
+ * C type safety. mddev->private is an 'opaque' pointer.
+ */
+#define mddev_to_conf(mddev) ((conf_t *) mddev->private)
+
+/*
+ * this is our 'private' RAID10 bio.
+ *
+ * it contains information about what kind of IO operations were started
+ * for this RAID10 operation, and about their status:
+ */
+
+struct r10bio_s {
+	atomic_t		remaining; /* 'have we finished' count,
+					    * used from IRQ handlers
+					    */
+	sector_t		sector;	/* virtual sector number */
+	int			sectors;
+	unsigned long		state;
+	mddev_t			*mddev;
+	/*
+	 * original bio going to /dev/mdx
+	 */
+	struct bio		*master_bio;
+	/*
+	 * if the IO is in READ direction, then this is where we read
+	 */
+	int			read_slot;
+
+	struct list_head	retry_list;
+	/*
+	 * if the IO is in WRITE direction, then multiple bios are used,
+	 * one for each copy.
+	 * When resyncing we also use one for each copy.
+	 * When reconstructing, we use 2 bios, one for read, one for write.
+	 * We choose the number when they are allocated.
+	 */
+	struct {
+		struct bio		*bio;
+		sector_t addr;
+		int devnum;
+	} devs[0];
+};
+
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error.  To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio*)1)
+
+/* bits for r10bio.state */
+#define	R10BIO_Uptodate	0
+#define	R10BIO_IsSync	1
+#define	R10BIO_IsRecover 2
+#define	R10BIO_Degraded 3
+#endif
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a5ba080d303..3bbc6d64704 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -43,11 +43,14 @@
  * miss any bits.
  */
 
+#include <linux/blkdev.h>
 #include <linux/kthread.h>
-#include "raid6.h"
-
-#include <linux/raid/bitmap.h>
+#include <linux/raid/pq.h>
 #include <linux/async_tx.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "raid5.h"
+#include "bitmap.h"
 
 /*
  * Stripe cache
@@ -91,11 +94,6 @@
 
 #define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
 
-#if !RAID6_USE_EMPTY_ZERO_PAGE
-/* In .bss so it's zeroed */
-const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
-#endif
-
 /*
  * We maintain a biased count of active stripes in the bottom 16 bits of
  * bi_phys_segments, and a count of processed stripes in the upper 16 bits
@@ -130,12 +128,42 @@ static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
 	bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16);
 }
 
+/* Find first data disk in a raid6 stripe */
+static inline int raid6_d0(struct stripe_head *sh)
+{
+	if (sh->ddf_layout)
+		/* ddf always start from first device */
+		return 0;
+	/* md starts just after Q block */
+	if (sh->qd_idx == sh->disks - 1)
+		return 0;
+	else
+		return sh->qd_idx + 1;
+}
 static inline int raid6_next_disk(int disk, int raid_disks)
 {
 	disk++;
 	return (disk < raid_disks) ? disk : 0;
 }
 
+/* When walking through the disks in a raid5, starting at raid6_d0,
+ * We need to map each disk to a 'slot', where the data disks are slot
+ * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
+ * is raid_disks-1.  This help does that mapping.
+ */
+static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
+			     int *count, int syndrome_disks)
+{
+	int slot;
+
+	if (idx == sh->pd_idx)
+		return syndrome_disks;
+	if (idx == sh->qd_idx)
+		return syndrome_disks + 1;
+	slot = (*count)++;
+	return slot;
+}
+
 static void return_io(struct bio *return_bi)
 {
 	struct bio *bi = return_bi;
@@ -193,6 +221,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 		}
 	}
 }
+
 static void release_stripe(struct stripe_head *sh)
 {
 	raid5_conf_t *conf = sh->raid_conf;
@@ -270,9 +299,11 @@ static int grow_buffers(struct stripe_head *sh, int num)
 	return 0;
 }
 
-static void raid5_build_block(struct stripe_head *sh, int i);
+static void raid5_build_block(struct stripe_head *sh, int i, int previous);
+static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
+			    struct stripe_head *sh);
 
-static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks)
+static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int i;
@@ -287,11 +318,12 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
 
 	remove_hash(sh);
 
+	sh->generation = conf->generation - previous;
+	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
 	sh->sector = sector;
-	sh->pd_idx = pd_idx;
+	stripe_set_idx(sector, conf, previous, sh);
 	sh->state = 0;
 
-	sh->disks = disks;
 
 	for (i = sh->disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
@@ -305,12 +337,13 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
 			BUG();
 		}
 		dev->flags = 0;
-		raid5_build_block(sh, i);
+		raid5_build_block(sh, i, previous);
 	}
 	insert_hash(conf, sh);
 }
 
-static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks)
+static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
+					 short generation)
 {
 	struct stripe_head *sh;
 	struct hlist_node *hn;
@@ -318,7 +351,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in
 	CHECK_DEVLOCK();
 	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
 	hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
-		if (sh->sector == sector && sh->disks == disks)
+		if (sh->sector == sector && sh->generation == generation)
 			return sh;
 	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
 	return NULL;
@@ -327,8 +360,9 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in
 static void unplug_slaves(mddev_t *mddev);
 static void raid5_unplug_device(struct request_queue *q);
 
-static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks,
-					     int pd_idx, int noblock)
+static struct stripe_head *
+get_active_stripe(raid5_conf_t *conf, sector_t sector,
+		  int previous, int noblock)
 {
 	struct stripe_head *sh;
 
@@ -340,7 +374,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 		wait_event_lock_irq(conf->wait_for_stripe,
 				    conf->quiesce == 0,
 				    conf->device_lock, /* nothing */);
-		sh = __find_stripe(conf, sector, disks);
+		sh = __find_stripe(conf, sector, conf->generation - previous);
 		if (!sh) {
 			if (!conf->inactive_blocked)
 				sh = get_free_stripe(conf);
@@ -358,10 +392,11 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 					);
 				conf->inactive_blocked = 0;
 			} else
-				init_stripe(sh, sector, pd_idx, disks);
+				init_stripe(sh, sector, previous);
 		} else {
 			if (atomic_read(&sh->count)) {
-			  BUG_ON(!list_empty(&sh->lru));
+				BUG_ON(!list_empty(&sh->lru)
+				    && !test_bit(STRIPE_EXPANDING, &sh->state));
 			} else {
 				if (!test_bit(STRIPE_HANDLE, &sh->state))
 					atomic_inc(&conf->active_stripes);
@@ -895,8 +930,10 @@ static int grow_stripes(raid5_conf_t *conf, int num)
 	struct kmem_cache *sc;
 	int devs = conf->raid_disks;
 
-	sprintf(conf->cache_name[0], "raid5-%s", mdname(conf->mddev));
-	sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev));
+	sprintf(conf->cache_name[0],
+		"raid%d-%s", conf->level, mdname(conf->mddev));
+	sprintf(conf->cache_name[1],
+		"raid%d-%s-alt", conf->level, mdname(conf->mddev));
 	conf->active_name = 0;
 	sc = kmem_cache_create(conf->cache_name[conf->active_name],
 			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
@@ -911,7 +948,6 @@ static int grow_stripes(raid5_conf_t *conf, int num)
 	return 0;
 }
 
-#ifdef CONFIG_MD_RAID5_RESHAPE
 static int resize_stripes(raid5_conf_t *conf, int newsize)
 {
 	/* Make all the stripes able to hold 'newsize' devices.
@@ -1036,7 +1072,6 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 	conf->pool_size = newsize;
 	return err;
 }
-#endif
 
 static int drop_one_stripe(raid5_conf_t *conf)
 {
@@ -1066,7 +1101,7 @@ static void shrink_stripes(raid5_conf_t *conf)
 
 static void raid5_end_read_request(struct bio * bi, int error)
 {
- 	struct stripe_head *sh = bi->bi_private;
+	struct stripe_head *sh = bi->bi_private;
 	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks, i;
 	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1148,7 +1183,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 
 static void raid5_end_write_request(struct bio *bi, int error)
 {
- 	struct stripe_head *sh = bi->bi_private;
+	struct stripe_head *sh = bi->bi_private;
 	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks, i;
 	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1176,9 +1211,9 @@ static void raid5_end_write_request(struct bio *bi, int error)
 }
 
 
-static sector_t compute_blocknr(struct stripe_head *sh, int i);
+static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
 	
-static void raid5_build_block(struct stripe_head *sh, int i)
+static void raid5_build_block(struct stripe_head *sh, int i, int previous)
 {
 	struct r5dev *dev = &sh->dev[i];
 
@@ -1194,7 +1229,7 @@ static void raid5_build_block(struct stripe_head *sh, int i)
 	dev->req.bi_private = sh;
 
 	dev->flags = 0;
-	dev->sector = compute_blocknr(sh, i);
+	dev->sector = compute_blocknr(sh, i, previous);
 }
 
 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
@@ -1227,15 +1262,23 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
  * Input: a 'big' sector number,
  * Output: index of the data and parity disk, and the sector # in them.
  */
-static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
-			unsigned int data_disks, unsigned int * dd_idx,
-			unsigned int * pd_idx, raid5_conf_t *conf)
+static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
+				     int previous, int *dd_idx,
+				     struct stripe_head *sh)
 {
 	long stripe;
 	unsigned long chunk_number;
 	unsigned int chunk_offset;
+	int pd_idx, qd_idx;
+	int ddf_layout = 0;
 	sector_t new_sector;
-	int sectors_per_chunk = conf->chunk_size >> 9;
+	int algorithm = previous ? conf->prev_algo
+				 : conf->algorithm;
+	int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
+					 : (conf->chunk_size >> 9);
+	int raid_disks = previous ? conf->previous_raid_disks
+				  : conf->raid_disks;
+	int data_disks = raid_disks - conf->max_degraded;
 
 	/* First compute the information on this sector */
 
@@ -1259,68 +1302,170 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
 	/*
 	 * Select the parity disk based on the user selected algorithm.
 	 */
+	pd_idx = qd_idx = ~0;
 	switch(conf->level) {
 	case 4:
-		*pd_idx = data_disks;
+		pd_idx = data_disks;
 		break;
 	case 5:
-		switch (conf->algorithm) {
+		switch (algorithm) {
 		case ALGORITHM_LEFT_ASYMMETRIC:
-			*pd_idx = data_disks - stripe % raid_disks;
-			if (*dd_idx >= *pd_idx)
+			pd_idx = data_disks - stripe % raid_disks;
+			if (*dd_idx >= pd_idx)
 				(*dd_idx)++;
 			break;
 		case ALGORITHM_RIGHT_ASYMMETRIC:
-			*pd_idx = stripe % raid_disks;
-			if (*dd_idx >= *pd_idx)
+			pd_idx = stripe % raid_disks;
+			if (*dd_idx >= pd_idx)
 				(*dd_idx)++;
 			break;
 		case ALGORITHM_LEFT_SYMMETRIC:
-			*pd_idx = data_disks - stripe % raid_disks;
-			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
+			pd_idx = data_disks - stripe % raid_disks;
+			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
 			break;
 		case ALGORITHM_RIGHT_SYMMETRIC:
-			*pd_idx = stripe % raid_disks;
-			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
+			pd_idx = stripe % raid_disks;
+			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
+			break;
+		case ALGORITHM_PARITY_0:
+			pd_idx = 0;
+			(*dd_idx)++;
+			break;
+		case ALGORITHM_PARITY_N:
+			pd_idx = data_disks;
 			break;
 		default:
 			printk(KERN_ERR "raid5: unsupported algorithm %d\n",
-				conf->algorithm);
+				algorithm);
+			BUG();
 		}
 		break;
 	case 6:
 
-		/**** FIX THIS ****/
-		switch (conf->algorithm) {
+		switch (algorithm) {
 		case ALGORITHM_LEFT_ASYMMETRIC:
-			*pd_idx = raid_disks - 1 - (stripe % raid_disks);
-			if (*pd_idx == raid_disks-1)
-				(*dd_idx)++; 	/* Q D D D P */
-			else if (*dd_idx >= *pd_idx)
+			pd_idx = raid_disks - 1 - (stripe % raid_disks);
+			qd_idx = pd_idx + 1;
+			if (pd_idx == raid_disks-1) {
+				(*dd_idx)++;	/* Q D D D P */
+				qd_idx = 0;
+			} else if (*dd_idx >= pd_idx)
 				(*dd_idx) += 2; /* D D P Q D */
 			break;
 		case ALGORITHM_RIGHT_ASYMMETRIC:
-			*pd_idx = stripe % raid_disks;
-			if (*pd_idx == raid_disks-1)
-				(*dd_idx)++; 	/* Q D D D P */
-			else if (*dd_idx >= *pd_idx)
+			pd_idx = stripe % raid_disks;
+			qd_idx = pd_idx + 1;
+			if (pd_idx == raid_disks-1) {
+				(*dd_idx)++;	/* Q D D D P */
+				qd_idx = 0;
+			} else if (*dd_idx >= pd_idx)
 				(*dd_idx) += 2; /* D D P Q D */
 			break;
 		case ALGORITHM_LEFT_SYMMETRIC:
-			*pd_idx = raid_disks - 1 - (stripe % raid_disks);
-			*dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
+			pd_idx = raid_disks - 1 - (stripe % raid_disks);
+			qd_idx = (pd_idx + 1) % raid_disks;
+			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
 			break;
 		case ALGORITHM_RIGHT_SYMMETRIC:
-			*pd_idx = stripe % raid_disks;
-			*dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
+			pd_idx = stripe % raid_disks;
+			qd_idx = (pd_idx + 1) % raid_disks;
+			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
+			break;
+
+		case ALGORITHM_PARITY_0:
+			pd_idx = 0;
+			qd_idx = 1;
+			(*dd_idx) += 2;
+			break;
+		case ALGORITHM_PARITY_N:
+			pd_idx = data_disks;
+			qd_idx = data_disks + 1;
 			break;
+
+		case ALGORITHM_ROTATING_ZERO_RESTART:
+			/* Exactly the same as RIGHT_ASYMMETRIC, but or
+			 * of blocks for computing Q is different.
+			 */
+			pd_idx = stripe % raid_disks;
+			qd_idx = pd_idx + 1;
+			if (pd_idx == raid_disks-1) {
+				(*dd_idx)++;	/* Q D D D P */
+				qd_idx = 0;
+			} else if (*dd_idx >= pd_idx)
+				(*dd_idx) += 2; /* D D P Q D */
+			ddf_layout = 1;
+			break;
+
+		case ALGORITHM_ROTATING_N_RESTART:
+			/* Same a left_asymmetric, by first stripe is
+			 * D D D P Q  rather than
+			 * Q D D D P
+			 */
+			pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks);
+			qd_idx = pd_idx + 1;
+			if (pd_idx == raid_disks-1) {
+				(*dd_idx)++;	/* Q D D D P */
+				qd_idx = 0;
+			} else if (*dd_idx >= pd_idx)
+				(*dd_idx) += 2; /* D D P Q D */
+			ddf_layout = 1;
+			break;
+
+		case ALGORITHM_ROTATING_N_CONTINUE:
+			/* Same as left_symmetric but Q is before P */
+			pd_idx = raid_disks - 1 - (stripe % raid_disks);
+			qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
+			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
+			ddf_layout = 1;
+			break;
+
+		case ALGORITHM_LEFT_ASYMMETRIC_6:
+			/* RAID5 left_asymmetric, with Q on last device */
+			pd_idx = data_disks - stripe % (raid_disks-1);
+			if (*dd_idx >= pd_idx)
+				(*dd_idx)++;
+			qd_idx = raid_disks - 1;
+			break;
+
+		case ALGORITHM_RIGHT_ASYMMETRIC_6:
+			pd_idx = stripe % (raid_disks-1);
+			if (*dd_idx >= pd_idx)
+				(*dd_idx)++;
+			qd_idx = raid_disks - 1;
+			break;
+
+		case ALGORITHM_LEFT_SYMMETRIC_6:
+			pd_idx = data_disks - stripe % (raid_disks-1);
+			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
+			qd_idx = raid_disks - 1;
+			break;
+
+		case ALGORITHM_RIGHT_SYMMETRIC_6:
+			pd_idx = stripe % (raid_disks-1);
+			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
+			qd_idx = raid_disks - 1;
+			break;
+
+		case ALGORITHM_PARITY_0_6:
+			pd_idx = 0;
+			(*dd_idx)++;
+			qd_idx = raid_disks - 1;
+			break;
+
+
 		default:
 			printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
-			       conf->algorithm);
+			       algorithm);
+			BUG();
 		}
 		break;
 	}
 
+	if (sh) {
+		sh->pd_idx = pd_idx;
+		sh->qd_idx = qd_idx;
+		sh->ddf_layout = ddf_layout;
+	}
 	/*
 	 * Finally, compute the new sector number
 	 */
@@ -1329,17 +1474,21 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
 }
 
 
-static sector_t compute_blocknr(struct stripe_head *sh, int i)
+static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int raid_disks = sh->disks;
 	int data_disks = raid_disks - conf->max_degraded;
 	sector_t new_sector = sh->sector, check;
-	int sectors_per_chunk = conf->chunk_size >> 9;
+	int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
+					 : (conf->chunk_size >> 9);
+	int algorithm = previous ? conf->prev_algo
+				 : conf->algorithm;
 	sector_t stripe;
 	int chunk_offset;
-	int chunk_number, dummy1, dummy2, dd_idx = i;
+	int chunk_number, dummy1, dd_idx = i;
 	sector_t r_sector;
+	struct stripe_head sh2;
 
 
 	chunk_offset = sector_div(new_sector, sectors_per_chunk);
@@ -1351,7 +1500,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 	switch(conf->level) {
 	case 4: break;
 	case 5:
-		switch (conf->algorithm) {
+		switch (algorithm) {
 		case ALGORITHM_LEFT_ASYMMETRIC:
 		case ALGORITHM_RIGHT_ASYMMETRIC:
 			if (i > sh->pd_idx)
@@ -1363,19 +1512,27 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 				i += raid_disks;
 			i -= (sh->pd_idx + 1);
 			break;
+		case ALGORITHM_PARITY_0:
+			i -= 1;
+			break;
+		case ALGORITHM_PARITY_N:
+			break;
 		default:
 			printk(KERN_ERR "raid5: unsupported algorithm %d\n",
-			       conf->algorithm);
+			       algorithm);
+			BUG();
 		}
 		break;
 	case 6:
-		if (i == raid6_next_disk(sh->pd_idx, raid_disks))
+		if (i == sh->qd_idx)
 			return 0; /* It is the Q disk */
-		switch (conf->algorithm) {
+		switch (algorithm) {
 		case ALGORITHM_LEFT_ASYMMETRIC:
 		case ALGORITHM_RIGHT_ASYMMETRIC:
-		  	if (sh->pd_idx == raid_disks-1)
-				i--; 	/* Q D D D P */
+		case ALGORITHM_ROTATING_ZERO_RESTART:
+		case ALGORITHM_ROTATING_N_RESTART:
+			if (sh->pd_idx == raid_disks-1)
+				i--;	/* Q D D D P */
 			else if (i > sh->pd_idx)
 				i -= 2; /* D D P Q D */
 			break;
@@ -1390,9 +1547,35 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 				i -= (sh->pd_idx + 2);
 			}
 			break;
+		case ALGORITHM_PARITY_0:
+			i -= 2;
+			break;
+		case ALGORITHM_PARITY_N:
+			break;
+		case ALGORITHM_ROTATING_N_CONTINUE:
+			if (sh->pd_idx == 0)
+				i--;	/* P D D D Q */
+			else if (i > sh->pd_idx)
+				i -= 2; /* D D Q P D */
+			break;
+		case ALGORITHM_LEFT_ASYMMETRIC_6:
+		case ALGORITHM_RIGHT_ASYMMETRIC_6:
+			if (i > sh->pd_idx)
+				i--;
+			break;
+		case ALGORITHM_LEFT_SYMMETRIC_6:
+		case ALGORITHM_RIGHT_SYMMETRIC_6:
+			if (i < sh->pd_idx)
+				i += data_disks + 1;
+			i -= (sh->pd_idx + 1);
+			break;
+		case ALGORITHM_PARITY_0_6:
+			i -= 1;
+			break;
 		default:
 			printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
-			       conf->algorithm);
+			       algorithm);
+			BUG();
 		}
 		break;
 	}
@@ -1400,8 +1583,10 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 	chunk_number = stripe * data_disks + i;
 	r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
 
-	check = raid5_compute_sector(r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
-	if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
+	check = raid5_compute_sector(conf, r_sector,
+				     previous, &dummy1, &sh2);
+	if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
+		|| sh2.qd_idx != sh->qd_idx) {
 		printk(KERN_ERR "compute_blocknr: map not correct\n");
 		return 0;
 	}
@@ -1468,14 +1653,16 @@ static void copy_data(int frombio, struct bio *bio,
 
 static void compute_parity6(struct stripe_head *sh, int method)
 {
-	raid6_conf_t *conf = sh->raid_conf;
-	int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
+	raid5_conf_t *conf = sh->raid_conf;
+	int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
+	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
 	struct bio *chosen;
 	/**** FIX THIS: This could be very bad if disks is close to 256 ****/
-	void *ptrs[disks];
+	void *ptrs[syndrome_disks+2];
 
-	qd_idx = raid6_next_disk(pd_idx, disks);
-	d0_idx = raid6_next_disk(qd_idx, disks);
+	pd_idx = sh->pd_idx;
+	qd_idx = sh->qd_idx;
+	d0_idx = raid6_d0(sh);
 
 	pr_debug("compute_parity, stripe %llu, method %d\n",
 		(unsigned long long)sh->sector, method);
@@ -1513,24 +1700,29 @@ static void compute_parity6(struct stripe_head *sh, int method)
 			set_bit(R5_UPTODATE, &sh->dev[i].flags);
 		}
 
-//	switch(method) {
-//	case RECONSTRUCT_WRITE:
-//	case CHECK_PARITY:
-//	case UPDATE_PARITY:
-		/* Note that unlike RAID-5, the ordering of the disks matters greatly. */
-		/* FIX: Is this ordering of drives even remotely optimal? */
-		count = 0;
-		i = d0_idx;
-		do {
-			ptrs[count++] = page_address(sh->dev[i].page);
-			if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
-				printk("block %d/%d not uptodate on parity calc\n", i,count);
-			i = raid6_next_disk(i, disks);
-		} while ( i != d0_idx );
-//		break;
-//	}
-
-	raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);
+	/* Note that unlike RAID-5, the ordering of the disks matters greatly.*/
+
+	for (i = 0; i < disks; i++)
+		ptrs[i] = (void *)raid6_empty_zero_page;
+
+	count = 0;
+	i = d0_idx;
+	do {
+		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+
+		ptrs[slot] = page_address(sh->dev[i].page);
+		if (slot < syndrome_disks &&
+		    !test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
+			printk(KERN_ERR "block %d/%d not uptodate "
+			       "on parity calc\n", i, count);
+			BUG();
+		}
+
+		i = raid6_next_disk(i, disks);
+	} while (i != d0_idx);
+	BUG_ON(count != syndrome_disks);
+
+	raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs);
 
 	switch(method) {
 	case RECONSTRUCT_WRITE:
@@ -1552,8 +1744,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
 {
 	int i, count, disks = sh->disks;
 	void *ptr[MAX_XOR_BLOCKS], *dest, *p;
-	int pd_idx = sh->pd_idx;
-	int qd_idx = raid6_next_disk(pd_idx, disks);
+	int qd_idx = sh->qd_idx;
 
 	pr_debug("compute_block_1, stripe %llu, idx %d\n",
 		(unsigned long long)sh->sector, dd_idx);
@@ -1589,63 +1780,65 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
 static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
 {
 	int i, count, disks = sh->disks;
-	int pd_idx = sh->pd_idx;
-	int qd_idx = raid6_next_disk(pd_idx, disks);
-	int d0_idx = raid6_next_disk(qd_idx, disks);
-	int faila, failb;
+	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
+	int d0_idx = raid6_d0(sh);
+	int faila = -1, failb = -1;
+	/**** FIX THIS: This could be very bad if disks is close to 256 ****/
+	void *ptrs[syndrome_disks+2];
 
-	/* faila and failb are disk numbers relative to d0_idx */
-	/* pd_idx become disks-2 and qd_idx become disks-1 */
-	faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
-	failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
+	for (i = 0; i < disks ; i++)
+		ptrs[i] = (void *)raid6_empty_zero_page;
+	count = 0;
+	i = d0_idx;
+	do {
+		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+
+		ptrs[slot] = page_address(sh->dev[i].page);
+
+		if (i == dd_idx1)
+			faila = slot;
+		if (i == dd_idx2)
+			failb = slot;
+		i = raid6_next_disk(i, disks);
+	} while (i != d0_idx);
+	BUG_ON(count != syndrome_disks);
 
 	BUG_ON(faila == failb);
 	if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
 
 	pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
-	       (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
+		 (unsigned long long)sh->sector, dd_idx1, dd_idx2,
+		 faila, failb);
 
-	if ( failb == disks-1 ) {
+	if (failb == syndrome_disks+1) {
 		/* Q disk is one of the missing disks */
-		if ( faila == disks-2 ) {
+		if (faila == syndrome_disks) {
 			/* Missing P+Q, just recompute */
 			compute_parity6(sh, UPDATE_PARITY);
 			return;
 		} else {
 			/* We're missing D+Q; recompute D from P */
-			compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
+			compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ?
+					     dd_idx2 : dd_idx1),
+					0);
 			compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
 			return;
 		}
 	}
 
-	/* We're missing D+P or D+D; build pointer table */
-	{
-		/**** FIX THIS: This could be very bad if disks is close to 256 ****/
-		void *ptrs[disks];
-
-		count = 0;
-		i = d0_idx;
-		do {
-			ptrs[count++] = page_address(sh->dev[i].page);
-			i = raid6_next_disk(i, disks);
-			if (i != dd_idx1 && i != dd_idx2 &&
-			    !test_bit(R5_UPTODATE, &sh->dev[i].flags))
-				printk("compute_2 with missing block %d/%d\n", count, i);
-		} while ( i != d0_idx );
-
-		if ( failb == disks-2 ) {
-			/* We're missing D+P. */
-			raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
-		} else {
-			/* We're missing D+D. */
-			raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
-		}
-
-		/* Both the above update both missing blocks */
-		set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
-		set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
+	/* We're missing D+P or D+D; */
+	if (failb == syndrome_disks) {
+		/* We're missing D+P. */
+		raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs);
+	} else {
+		/* We're missing D+D. */
+		raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb,
+				  ptrs);
 	}
+
+	/* Both the above update both missing blocks */
+	set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
+	set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
 }
 
 static void
@@ -1800,17 +1993,21 @@ static int page_is_zero(struct page *p)
 		memcmp(a, a+4, STRIPE_SIZE-4)==0);
 }
 
-static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
+static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
+			    struct stripe_head *sh)
 {
-	int sectors_per_chunk = conf->chunk_size >> 9;
-	int pd_idx, dd_idx;
+	int sectors_per_chunk =
+		previous ? (conf->prev_chunk >> 9)
+			 : (conf->chunk_size >> 9);
+	int dd_idx;
 	int chunk_offset = sector_div(stripe, sectors_per_chunk);
+	int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
 
-	raid5_compute_sector(stripe * (disks - conf->max_degraded)
+	raid5_compute_sector(conf,
+			     stripe * (disks - conf->max_degraded)
 			     *sectors_per_chunk + chunk_offset,
-			     disks, disks - conf->max_degraded,
-			     &dd_idx, &pd_idx, conf);
-	return pd_idx;
+			     previous,
+			     &dd_idx, sh);
 }
 
 static void
@@ -2181,7 +2378,7 @@ static void handle_stripe_dirtying6(raid5_conf_t *conf,
 		struct r6_state *r6s, int disks)
 {
 	int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;
-	int qd_idx = r6s->qd_idx;
+	int qd_idx = sh->qd_idx;
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 		/* Would I have to read this buffer for reconstruct_write */
@@ -2371,7 +2568,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
 	int update_p = 0, update_q = 0;
 	struct r5dev *dev;
 	int pd_idx = sh->pd_idx;
-	int qd_idx = r6s->qd_idx;
+	int qd_idx = sh->qd_idx;
 
 	set_bit(STRIPE_HANDLE, &sh->state);
 
@@ -2467,17 +2664,14 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
 	struct dma_async_tx_descriptor *tx = NULL;
 	clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
 	for (i = 0; i < sh->disks; i++)
-		if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) {
-			int dd_idx, pd_idx, j;
+		if (i != sh->pd_idx && i != sh->qd_idx) {
+			int dd_idx, j;
 			struct stripe_head *sh2;
 
-			sector_t bn = compute_blocknr(sh, i);
-			sector_t s = raid5_compute_sector(bn, conf->raid_disks,
-						conf->raid_disks -
-						conf->max_degraded, &dd_idx,
-						&pd_idx, conf);
-			sh2 = get_active_stripe(conf, s, conf->raid_disks,
-						pd_idx, 1);
+			sector_t bn = compute_blocknr(sh, i, 1);
+			sector_t s = raid5_compute_sector(conf, bn, 0,
+							  &dd_idx, NULL);
+			sh2 = get_active_stripe(conf, s, 0, 1);
 			if (sh2 == NULL)
 				/* so far only the early blocks of this stripe
 				 * have been requested.  When later blocks
@@ -2500,8 +2694,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
 			set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
 			for (j = 0; j < conf->raid_disks; j++)
 				if (j != sh2->pd_idx &&
-				    (!r6s || j != raid6_next_disk(sh2->pd_idx,
-								 sh2->disks)) &&
+				    (!r6s || j != sh2->qd_idx) &&
 				    !test_bit(R5_Expanded, &sh2->dev[j].flags))
 					break;
 			if (j == conf->raid_disks) {
@@ -2750,6 +2943,23 @@ static bool handle_stripe5(struct stripe_head *sh)
 
 	/* Finish reconstruct operations initiated by the expansion process */
 	if (sh->reconstruct_state == reconstruct_state_result) {
+		struct stripe_head *sh2
+			= get_active_stripe(conf, sh->sector, 1, 1);
+		if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
+			/* sh cannot be written until sh2 has been read.
+			 * so arrange for sh to be delayed a little
+			 */
+			set_bit(STRIPE_DELAYED, &sh->state);
+			set_bit(STRIPE_HANDLE, &sh->state);
+			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
+					      &sh2->state))
+				atomic_inc(&conf->preread_active_stripes);
+			release_stripe(sh2);
+			goto unlock;
+		}
+		if (sh2)
+			release_stripe(sh2);
+
 		sh->reconstruct_state = reconstruct_state_idle;
 		clear_bit(STRIPE_EXPANDING, &sh->state);
 		for (i = conf->raid_disks; i--; ) {
@@ -2763,8 +2973,7 @@ static bool handle_stripe5(struct stripe_head *sh)
 	    !sh->reconstruct_state) {
 		/* Need to write out all blocks after computing parity */
 		sh->disks = conf->raid_disks;
-		sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
-			conf->raid_disks);
+		stripe_set_idx(sh->sector, conf, 0, sh);
 		schedule_reconstruction5(sh, &s, 1, 1);
 	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
@@ -2796,20 +3005,19 @@ static bool handle_stripe5(struct stripe_head *sh)
 
 static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 {
-	raid6_conf_t *conf = sh->raid_conf;
+	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks;
 	struct bio *return_bi = NULL;
-	int i, pd_idx = sh->pd_idx;
+	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx;
 	struct stripe_head_state s;
 	struct r6_state r6s;
 	struct r5dev *dev, *pdev, *qdev;
 	mdk_rdev_t *blocked_rdev = NULL;
 
-	r6s.qd_idx = raid6_next_disk(pd_idx, disks);
 	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
 		"pd_idx=%d, qd_idx=%d\n",
 	       (unsigned long long)sh->sector, sh->state,
-	       atomic_read(&sh->count), pd_idx, r6s.qd_idx);
+	       atomic_read(&sh->count), pd_idx, qd_idx);
 	memset(&s, 0, sizeof(s));
 
 	spin_lock(&sh->lock);
@@ -2920,9 +3128,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	pdev = &sh->dev[pd_idx];
 	r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx)
 		|| (s.failed >= 2 && r6s.failed_num[1] == pd_idx);
-	qdev = &sh->dev[r6s.qd_idx];
-	r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx)
-		|| (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx);
+	qdev = &sh->dev[qd_idx];
+	r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx)
+		|| (s.failed >= 2 && r6s.failed_num[1] == qd_idx);
 
 	if ( s.written &&
 	     ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
@@ -2980,10 +3188,26 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 		}
 
 	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+		struct stripe_head *sh2
+			= get_active_stripe(conf, sh->sector, 1, 1);
+		if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
+			/* sh cannot be written until sh2 has been read.
+			 * so arrange for sh to be delayed a little
+			 */
+			set_bit(STRIPE_DELAYED, &sh->state);
+			set_bit(STRIPE_HANDLE, &sh->state);
+			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
+					      &sh2->state))
+				atomic_inc(&conf->preread_active_stripes);
+			release_stripe(sh2);
+			goto unlock;
+		}
+		if (sh2)
+			release_stripe(sh2);
+
 		/* Need to write out all blocks after computing P&Q */
 		sh->disks = conf->raid_disks;
-		sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
-					     conf->raid_disks);
+		stripe_set_idx(sh->sector, conf, 0, sh);
 		compute_parity6(sh, RECONSTRUCT_WRITE);
 		for (i = conf->raid_disks ; i-- ;  ) {
 			set_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -3134,6 +3358,8 @@ static int raid5_mergeable_bvec(struct request_queue *q,
 	if ((bvm->bi_rw & 1) == WRITE)
 		return biovec->bv_len; /* always allow writes to be mergeable */
 
+	if (mddev->new_chunk < mddev->chunk_size)
+		chunk_sectors = mddev->new_chunk >> 9;
 	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
 	if (max < 0) max = 0;
 	if (max <= biovec->bv_len && bio_sectors == 0)
@@ -3149,6 +3375,8 @@ static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
 	unsigned int chunk_sectors = mddev->chunk_size >> 9;
 	unsigned int bio_sectors = bio->bi_size >> 9;
 
+	if (mddev->new_chunk < mddev->chunk_size)
+		chunk_sectors = mddev->new_chunk >> 9;
 	return  chunk_sectors >=
 		((sector & (chunk_sectors - 1)) + bio_sectors);
 }
@@ -3255,9 +3483,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 {
 	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev_to_conf(mddev);
-	const unsigned int raid_disks = conf->raid_disks;
-	const unsigned int data_disks = raid_disks - conf->max_degraded;
-	unsigned int dd_idx, pd_idx;
+	unsigned int dd_idx;
 	struct bio* align_bi;
 	mdk_rdev_t *rdev;
 
@@ -3266,7 +3492,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 		return 0;
 	}
 	/*
- 	 * use bio_clone to make a copy of the bio
+	 * use bio_clone to make a copy of the bio
 	 */
 	align_bi = bio_clone(raid_bio, GFP_NOIO);
 	if (!align_bi)
@@ -3280,12 +3506,9 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 	/*
 	 *	compute position
 	 */
-	align_bi->bi_sector =  raid5_compute_sector(raid_bio->bi_sector,
-					raid_disks,
-					data_disks,
-					&dd_idx,
-					&pd_idx,
-					conf);
+	align_bi->bi_sector =  raid5_compute_sector(conf, raid_bio->bi_sector,
+						    0,
+						    &dd_idx, NULL);
 
 	rcu_read_lock();
 	rdev = rcu_dereference(conf->disks[dd_idx].rdev);
@@ -3377,7 +3600,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 {
 	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev_to_conf(mddev);
-	unsigned int dd_idx, pd_idx;
+	int dd_idx;
 	sector_t new_sector;
 	sector_t logical_sector, last_sector;
 	struct stripe_head *sh;
@@ -3400,7 +3623,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 	if (rw == READ &&
 	     mddev->reshape_position == MaxSector &&
 	     chunk_aligned_read(q,bi))
-            	return 0;
+		return 0;
 
 	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 	last_sector = bi->bi_sector + (bi->bi_size>>9);
@@ -3410,26 +3633,31 @@ static int make_request(struct request_queue *q, struct bio * bi)
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
 		DEFINE_WAIT(w);
 		int disks, data_disks;
+		int previous;
 
 	retry:
+		previous = 0;
+		disks = conf->raid_disks;
 		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
-		if (likely(conf->expand_progress == MaxSector))
-			disks = conf->raid_disks;
-		else {
-			/* spinlock is needed as expand_progress may be
+		if (unlikely(conf->reshape_progress != MaxSector)) {
+			/* spinlock is needed as reshape_progress may be
 			 * 64bit on a 32bit platform, and so it might be
 			 * possible to see a half-updated value
-			 * Ofcourse expand_progress could change after
+			 * Ofcourse reshape_progress could change after
 			 * the lock is dropped, so once we get a reference
 			 * to the stripe that we think it is, we will have
 			 * to check again.
 			 */
 			spin_lock_irq(&conf->device_lock);
-			disks = conf->raid_disks;
-			if (logical_sector >= conf->expand_progress)
+			if (mddev->delta_disks < 0
+			    ? logical_sector < conf->reshape_progress
+			    : logical_sector >= conf->reshape_progress) {
 				disks = conf->previous_raid_disks;
-			else {
-				if (logical_sector >= conf->expand_lo) {
+				previous = 1;
+			} else {
+				if (mddev->delta_disks < 0
+				    ? logical_sector < conf->reshape_safe
+				    : logical_sector >= conf->reshape_safe) {
 					spin_unlock_irq(&conf->device_lock);
 					schedule();
 					goto retry;
@@ -3439,15 +3667,17 @@ static int make_request(struct request_queue *q, struct bio * bi)
 		}
 		data_disks = disks - conf->max_degraded;
 
- 		new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
-						  &dd_idx, &pd_idx, conf);
+		new_sector = raid5_compute_sector(conf, logical_sector,
+						  previous,
+						  &dd_idx, NULL);
 		pr_debug("raid5: make_request, sector %llu logical %llu\n",
 			(unsigned long long)new_sector, 
 			(unsigned long long)logical_sector);
 
-		sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
+		sh = get_active_stripe(conf, new_sector, previous,
+				       (bi->bi_rw&RWA_MASK));
 		if (sh) {
-			if (unlikely(conf->expand_progress != MaxSector)) {
+			if (unlikely(previous)) {
 				/* expansion might have moved on while waiting for a
 				 * stripe, so we must do the range check again.
 				 * Expansion could still move past after this
@@ -3458,8 +3688,9 @@ static int make_request(struct request_queue *q, struct bio * bi)
 				 */
 				int must_retry = 0;
 				spin_lock_irq(&conf->device_lock);
-				if (logical_sector <  conf->expand_progress &&
-				    disks == conf->previous_raid_disks)
+				if (mddev->delta_disks < 0
+				    ? logical_sector >= conf->reshape_progress
+				    : logical_sector < conf->reshape_progress)
 					/* mismatch, need to try again */
 					must_retry = 1;
 				spin_unlock_irq(&conf->device_lock);
@@ -3514,6 +3745,8 @@ static int make_request(struct request_queue *q, struct bio * bi)
 	return 0;
 }
 
+static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks);
+
 static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
 {
 	/* reshaping is quite different to recovery/resync so it is
@@ -3527,61 +3760,118 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	 */
 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 	struct stripe_head *sh;
-	int pd_idx;
 	sector_t first_sector, last_sector;
 	int raid_disks = conf->previous_raid_disks;
 	int data_disks = raid_disks - conf->max_degraded;
 	int new_data_disks = conf->raid_disks - conf->max_degraded;
 	int i;
 	int dd_idx;
-	sector_t writepos, safepos, gap;
-
-	if (sector_nr == 0 &&
-	    conf->expand_progress != 0) {
-		/* restarting in the middle, skip the initial sectors */
-		sector_nr = conf->expand_progress;
+	sector_t writepos, readpos, safepos;
+	sector_t stripe_addr;
+	int reshape_sectors;
+	struct list_head stripes;
+
+	if (sector_nr == 0) {
+		/* If restarting in the middle, skip the initial sectors */
+		if (mddev->delta_disks < 0 &&
+		    conf->reshape_progress < raid5_size(mddev, 0, 0)) {
+			sector_nr = raid5_size(mddev, 0, 0)
+				- conf->reshape_progress;
+		} else if (mddev->delta_disks > 0 &&
+			   conf->reshape_progress > 0)
+			sector_nr = conf->reshape_progress;
 		sector_div(sector_nr, new_data_disks);
-		*skipped = 1;
-		return sector_nr;
+		if (sector_nr) {
+			*skipped = 1;
+			return sector_nr;
+		}
 	}
 
+	/* We need to process a full chunk at a time.
+	 * If old and new chunk sizes differ, we need to process the
+	 * largest of these
+	 */
+	if (mddev->new_chunk > mddev->chunk_size)
+		reshape_sectors = mddev->new_chunk / 512;
+	else
+		reshape_sectors = mddev->chunk_size / 512;
+
 	/* we update the metadata when there is more than 3Meg
 	 * in the block range (that is rather arbitrary, should
 	 * probably be time based) or when the data about to be
 	 * copied would over-write the source of the data at
 	 * the front of the range.
-	 * i.e. one new_stripe forward from expand_progress new_maps
-	 * to after where expand_lo old_maps to
+	 * i.e. one new_stripe along from reshape_progress new_maps
+	 * to after where reshape_safe old_maps to
 	 */
-	writepos = conf->expand_progress +
-		conf->chunk_size/512*(new_data_disks);
+	writepos = conf->reshape_progress;
 	sector_div(writepos, new_data_disks);
-	safepos = conf->expand_lo;
+	readpos = conf->reshape_progress;
+	sector_div(readpos, data_disks);
+	safepos = conf->reshape_safe;
 	sector_div(safepos, data_disks);
-	gap = conf->expand_progress - conf->expand_lo;
+	if (mddev->delta_disks < 0) {
+		writepos -= reshape_sectors;
+		readpos += reshape_sectors;
+		safepos += reshape_sectors;
+	} else {
+		writepos += reshape_sectors;
+		readpos -= reshape_sectors;
+		safepos -= reshape_sectors;
+	}
 
-	if (writepos >= safepos ||
-	    gap > (new_data_disks)*3000*2 /*3Meg*/) {
+	/* 'writepos' is the most advanced device address we might write.
+	 * 'readpos' is the least advanced device address we might read.
+	 * 'safepos' is the least address recorded in the metadata as having
+	 *     been reshaped.
+	 * If 'readpos' is behind 'writepos', then there is no way that we can
+	 * ensure safety in the face of a crash - that must be done by userspace
+	 * making a backup of the data.  So in that case there is no particular
+	 * rush to update metadata.
+	 * Otherwise if 'safepos' is behind 'writepos', then we really need to
+	 * update the metadata to advance 'safepos' to match 'readpos' so that
+	 * we can be safe in the event of a crash.
+	 * So we insist on updating metadata if safepos is behind writepos and
+	 * readpos is beyond writepos.
+	 * In any case, update the metadata every 10 seconds.
+	 * Maybe that number should be configurable, but I'm not sure it is
+	 * worth it.... maybe it could be a multiple of safemode_delay???
+	 */
+	if ((mddev->delta_disks < 0
+	     ? (safepos > writepos && readpos < writepos)
+	     : (safepos < writepos && readpos > writepos)) ||
+	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
 		/* Cannot proceed until we've updated the superblock... */
 		wait_event(conf->wait_for_overlap,
 			   atomic_read(&conf->reshape_stripes)==0);
-		mddev->reshape_position = conf->expand_progress;
+		mddev->reshape_position = conf->reshape_progress;
+		conf->reshape_checkpoint = jiffies;
 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		md_wakeup_thread(mddev->thread);
 		wait_event(mddev->sb_wait, mddev->flags == 0 ||
 			   kthread_should_stop());
 		spin_lock_irq(&conf->device_lock);
-		conf->expand_lo = mddev->reshape_position;
+		conf->reshape_safe = mddev->reshape_position;
 		spin_unlock_irq(&conf->device_lock);
 		wake_up(&conf->wait_for_overlap);
 	}
 
-	for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
+	if (mddev->delta_disks < 0) {
+		BUG_ON(conf->reshape_progress == 0);
+		stripe_addr = writepos;
+		BUG_ON((mddev->dev_sectors &
+			~((sector_t)reshape_sectors - 1))
+		       - reshape_sectors - stripe_addr
+		       != sector_nr);
+	} else {
+		BUG_ON(writepos != sector_nr + reshape_sectors);
+		stripe_addr = sector_nr;
+	}
+	INIT_LIST_HEAD(&stripes);
+	for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
 		int j;
 		int skipped = 0;
-		pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
-		sh = get_active_stripe(conf, sector_nr+i,
-				       conf->raid_disks, pd_idx, 0);
+		sh = get_active_stripe(conf, stripe_addr+i, 0, 0);
 		set_bit(STRIPE_EXPANDING, &sh->state);
 		atomic_inc(&conf->reshape_stripes);
 		/* If any of this stripe is beyond the end of the old
@@ -3592,10 +3882,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 			if (j == sh->pd_idx)
 				continue;
 			if (conf->level == 6 &&
-			    j == raid6_next_disk(sh->pd_idx, sh->disks))
+			    j == sh->qd_idx)
 				continue;
-			s = compute_blocknr(sh, j);
-			if (s < mddev->array_sectors) {
+			s = compute_blocknr(sh, j, 0);
+			if (s < raid5_size(mddev, 0, 0)) {
 				skipped = 1;
 				continue;
 			}
@@ -3607,10 +3897,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 			set_bit(STRIPE_EXPAND_READY, &sh->state);
 			set_bit(STRIPE_HANDLE, &sh->state);
 		}
-		release_stripe(sh);
+		list_add(&sh->lru, &stripes);
 	}
 	spin_lock_irq(&conf->device_lock);
-	conf->expand_progress = (sector_nr + i) * new_data_disks;
+	if (mddev->delta_disks < 0)
+		conf->reshape_progress -= reshape_sectors * new_data_disks;
+	else
+		conf->reshape_progress += reshape_sectors * new_data_disks;
 	spin_unlock_irq(&conf->device_lock);
 	/* Ok, those stripe are ready. We can start scheduling
 	 * reads on the source stripes.
@@ -3618,46 +3911,50 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	 * block on the destination stripes.
 	 */
 	first_sector =
-		raid5_compute_sector(sector_nr*(new_data_disks),
-				     raid_disks, data_disks,
-				     &dd_idx, &pd_idx, conf);
+		raid5_compute_sector(conf, stripe_addr*(new_data_disks),
+				     1, &dd_idx, NULL);
 	last_sector =
-		raid5_compute_sector((sector_nr+conf->chunk_size/512)
-				     *(new_data_disks) -1,
-				     raid_disks, data_disks,
-				     &dd_idx, &pd_idx, conf);
-	if (last_sector >= (mddev->size<<1))
-		last_sector = (mddev->size<<1)-1;
+		raid5_compute_sector(conf, ((stripe_addr+conf->chunk_size/512)
+					    *(new_data_disks) - 1),
+				     1, &dd_idx, NULL);
+	if (last_sector >= mddev->dev_sectors)
+		last_sector = mddev->dev_sectors - 1;
 	while (first_sector <= last_sector) {
-		pd_idx = stripe_to_pdidx(first_sector, conf,
-					 conf->previous_raid_disks);
-		sh = get_active_stripe(conf, first_sector,
-				       conf->previous_raid_disks, pd_idx, 0);
+		sh = get_active_stripe(conf, first_sector, 1, 0);
 		set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
 		set_bit(STRIPE_HANDLE, &sh->state);
 		release_stripe(sh);
 		first_sector += STRIPE_SECTORS;
 	}
+	/* Now that the sources are clearly marked, we can release
+	 * the destination stripes
+	 */
+	while (!list_empty(&stripes)) {
+		sh = list_entry(stripes.next, struct stripe_head, lru);
+		list_del_init(&sh->lru);
+		release_stripe(sh);
+	}
 	/* If this takes us to the resync_max point where we have to pause,
 	 * then we need to write out the superblock.
 	 */
-	sector_nr += conf->chunk_size>>9;
+	sector_nr += reshape_sectors;
 	if (sector_nr >= mddev->resync_max) {
 		/* Cannot proceed until we've updated the superblock... */
 		wait_event(conf->wait_for_overlap,
 			   atomic_read(&conf->reshape_stripes) == 0);
-		mddev->reshape_position = conf->expand_progress;
+		mddev->reshape_position = conf->reshape_progress;
+		conf->reshape_checkpoint = jiffies;
 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		md_wakeup_thread(mddev->thread);
 		wait_event(mddev->sb_wait,
 			   !test_bit(MD_CHANGE_DEVS, &mddev->flags)
 			   || kthread_should_stop());
 		spin_lock_irq(&conf->device_lock);
-		conf->expand_lo = mddev->reshape_position;
+		conf->reshape_safe = mddev->reshape_position;
 		spin_unlock_irq(&conf->device_lock);
 		wake_up(&conf->wait_for_overlap);
 	}
-	return conf->chunk_size>>9;
+	return reshape_sectors;
 }
 
 /* FIXME go_faster isn't used */
@@ -3665,9 +3962,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 {
 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 	struct stripe_head *sh;
-	int pd_idx;
-	int raid_disks = conf->raid_disks;
-	sector_t max_sector = mddev->size << 1;
+	sector_t max_sector = mddev->dev_sectors;
 	int sync_blocks;
 	int still_degraded = 0;
 	int i;
@@ -3675,6 +3970,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 	if (sector_nr >= max_sector) {
 		/* just being told to finish up .. nothing much to do */
 		unplug_slaves(mddev);
+
 		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
 			end_reshape(conf);
 			return 0;
@@ -3705,7 +4001,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 	 */
 	if (mddev->degraded >= conf->max_degraded &&
 	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
-		sector_t rv = (mddev->size << 1) - sector_nr;
+		sector_t rv = mddev->dev_sectors - sector_nr;
 		*skipped = 1;
 		return rv;
 	}
@@ -3721,10 +4017,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 
 	bitmap_cond_end_sync(mddev->bitmap, sector_nr);
 
-	pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks);
-	sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
+	sh = get_active_stripe(conf, sector_nr, 0, 1);
 	if (sh == NULL) {
-		sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
+		sh = get_active_stripe(conf, sector_nr, 0, 0);
 		/* make sure we don't swamp the stripe cache if someone else
 		 * is trying to get access
 		 */
@@ -3766,19 +4061,15 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 	 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
 	 */
 	struct stripe_head *sh;
-	int dd_idx, pd_idx;
+	int dd_idx;
 	sector_t sector, logical_sector, last_sector;
 	int scnt = 0;
 	int remaining;
 	int handled = 0;
 
 	logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
-	sector = raid5_compute_sector(	logical_sector,
-					conf->raid_disks,
-					conf->raid_disks - conf->max_degraded,
-					&dd_idx,
-					&pd_idx,
-					conf);
+	sector = raid5_compute_sector(conf, logical_sector,
+				      0, &dd_idx, NULL);
 	last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
 
 	for (; logical_sector < last_sector;
@@ -3790,7 +4081,7 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 			/* already done this stripe */
 			continue;
 
-		sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1);
+		sh = get_active_stripe(conf, sector, 0, 1);
 
 		if (!sh) {
 			/* failed to get a stripe - must wait */
@@ -3992,89 +4283,69 @@ static struct attribute_group raid5_attrs_group = {
 	.attrs = raid5_attrs,
 };
 
-static int run(mddev_t *mddev)
+static sector_t
+raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+
+	if (!sectors)
+		sectors = mddev->dev_sectors;
+	if (!raid_disks) {
+		/* size is defined by the smallest of previous and new size */
+		if (conf->raid_disks < conf->previous_raid_disks)
+			raid_disks = conf->raid_disks;
+		else
+			raid_disks = conf->previous_raid_disks;
+	}
+
+	sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
+	sectors &= ~((sector_t)mddev->new_chunk/512 - 1);
+	return sectors * (raid_disks - conf->max_degraded);
+}
+
+static raid5_conf_t *setup_conf(mddev_t *mddev)
 {
 	raid5_conf_t *conf;
 	int raid_disk, memory;
 	mdk_rdev_t *rdev;
 	struct disk_info *disk;
-	int working_disks = 0;
 
-	if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) {
+	if (mddev->new_level != 5
+	    && mddev->new_level != 4
+	    && mddev->new_level != 6) {
 		printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
-		       mdname(mddev), mddev->level);
-		return -EIO;
+		       mdname(mddev), mddev->new_level);
+		return ERR_PTR(-EIO);
 	}
-
-	if (mddev->chunk_size < PAGE_SIZE) {
-		printk(KERN_ERR "md/raid5: chunk_size must be at least "
-		       "PAGE_SIZE but %d < %ld\n",
-		       mddev->chunk_size, PAGE_SIZE);
-		return -EINVAL;
+	if ((mddev->new_level == 5
+	     && !algorithm_valid_raid5(mddev->new_layout)) ||
+	    (mddev->new_level == 6
+	     && !algorithm_valid_raid6(mddev->new_layout))) {
+		printk(KERN_ERR "raid5: %s: layout %d not supported\n",
+		       mdname(mddev), mddev->new_layout);
+		return ERR_PTR(-EIO);
 	}
-
-	if (mddev->reshape_position != MaxSector) {
-		/* Check that we can continue the reshape.
-		 * Currently only disks can change, it must
-		 * increase, and we must be past the point where
-		 * a stripe over-writes itself
-		 */
-		sector_t here_new, here_old;
-		int old_disks;
-		int max_degraded = (mddev->level == 5 ? 1 : 2);
-
-		if (mddev->new_level != mddev->level ||
-		    mddev->new_layout != mddev->layout ||
-		    mddev->new_chunk != mddev->chunk_size) {
-			printk(KERN_ERR "raid5: %s: unsupported reshape "
-			       "required - aborting.\n",
-			       mdname(mddev));
-			return -EINVAL;
-		}
-		if (mddev->delta_disks <= 0) {
-			printk(KERN_ERR "raid5: %s: unsupported reshape "
-			       "(reduce disks) required - aborting.\n",
-			       mdname(mddev));
-			return -EINVAL;
-		}
-		old_disks = mddev->raid_disks - mddev->delta_disks;
-		/* reshape_position must be on a new-stripe boundary, and one
-		 * further up in new geometry must map after here in old
-		 * geometry.
-		 */
-		here_new = mddev->reshape_position;
-		if (sector_div(here_new, (mddev->chunk_size>>9)*
-			       (mddev->raid_disks - max_degraded))) {
-			printk(KERN_ERR "raid5: reshape_position not "
-			       "on a stripe boundary\n");
-			return -EINVAL;
-		}
-		/* here_new is the stripe we will write to */
-		here_old = mddev->reshape_position;
-		sector_div(here_old, (mddev->chunk_size>>9)*
-			   (old_disks-max_degraded));
-		/* here_old is the first stripe that we might need to read
-		 * from */
-		if (here_new >= here_old) {
-			/* Reading from the same stripe as writing to - bad */
-			printk(KERN_ERR "raid5: reshape_position too early for "
-			       "auto-recovery - aborting.\n");
-			return -EINVAL;
-		}
-		printk(KERN_INFO "raid5: reshape will continue\n");
-		/* OK, we should be able to continue; */
+	if (mddev->new_level == 6 && mddev->raid_disks < 4) {
+		printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
+		       mdname(mddev), mddev->raid_disks);
+		return ERR_PTR(-EINVAL);
 	}
 
+	if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) {
+		printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
+			mddev->new_chunk, mdname(mddev));
+		return ERR_PTR(-EINVAL);
+	}
 
-	mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL);
-	if ((conf = mddev->private) == NULL)
+	conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL);
+	if (conf == NULL)
 		goto abort;
-	if (mddev->reshape_position == MaxSector) {
-		conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks;
-	} else {
-		conf->raid_disks = mddev->raid_disks;
+
+	conf->raid_disks = mddev->raid_disks;
+	if (mddev->reshape_position == MaxSector)
+		conf->previous_raid_disks = mddev->raid_disks;
+	else
 		conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
-	}
 
 	conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
 			      GFP_KERNEL);
@@ -4086,13 +4357,12 @@ static int run(mddev_t *mddev)
 	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
 		goto abort;
 
-	if (mddev->level == 6) {
+	if (mddev->new_level == 6) {
 		conf->spare_page = alloc_page(GFP_KERNEL);
 		if (!conf->spare_page)
 			goto abort;
 	}
 	spin_lock_init(&conf->device_lock);
-	mddev->queue->queue_lock = &conf->device_lock;
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
 	INIT_LIST_HEAD(&conf->handle_list);
@@ -4121,47 +4391,134 @@ static int run(mddev_t *mddev)
 			printk(KERN_INFO "raid5: device %s operational as raid"
 				" disk %d\n", bdevname(rdev->bdev,b),
 				raid_disk);
-			working_disks++;
 		} else
 			/* Cannot rely on bitmap to complete recovery */
 			conf->fullsync = 1;
 	}
 
-	/*
-	 * 0 for a fully functional array, 1 or 2 for a degraded array.
-	 */
-	mddev->degraded = conf->raid_disks - working_disks;
-	conf->mddev = mddev;
-	conf->chunk_size = mddev->chunk_size;
-	conf->level = mddev->level;
+	conf->chunk_size = mddev->new_chunk;
+	conf->level = mddev->new_level;
 	if (conf->level == 6)
 		conf->max_degraded = 2;
 	else
 		conf->max_degraded = 1;
-	conf->algorithm = mddev->layout;
+	conf->algorithm = mddev->new_layout;
 	conf->max_nr_stripes = NR_STRIPES;
-	conf->expand_progress = mddev->reshape_position;
-
-	/* device size must be a multiple of chunk size */
-	mddev->size &= ~(mddev->chunk_size/1024 -1);
-	mddev->resync_max_sectors = mddev->size << 1;
+	conf->reshape_progress = mddev->reshape_position;
+	if (conf->reshape_progress != MaxSector) {
+		conf->prev_chunk = mddev->chunk_size;
+		conf->prev_algo = mddev->layout;
+	}
 
-	if (conf->level == 6 && conf->raid_disks < 4) {
-		printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
-		       mdname(mddev), conf->raid_disks);
+	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
+		 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
+	if (grow_stripes(conf, conf->max_nr_stripes)) {
+		printk(KERN_ERR
+			"raid5: couldn't allocate %dkB for buffers\n", memory);
 		goto abort;
-	}
-	if (!conf->chunk_size || conf->chunk_size % 4) {
-		printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
-			conf->chunk_size, mdname(mddev));
+	} else
+		printk(KERN_INFO "raid5: allocated %dkB for %s\n",
+			memory, mdname(mddev));
+
+	conf->thread = md_register_thread(raid5d, mddev, "%s_raid5");
+	if (!conf->thread) {
+		printk(KERN_ERR
+		       "raid5: couldn't allocate thread for %s\n",
+		       mdname(mddev));
 		goto abort;
 	}
-	if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
-		printk(KERN_ERR 
-			"raid5: unsupported parity algorithm %d for %s\n",
-			conf->algorithm, mdname(mddev));
-		goto abort;
+
+	return conf;
+
+ abort:
+	if (conf) {
+		shrink_stripes(conf);
+		safe_put_page(conf->spare_page);
+		kfree(conf->disks);
+		kfree(conf->stripe_hashtbl);
+		kfree(conf);
+		return ERR_PTR(-EIO);
+	} else
+		return ERR_PTR(-ENOMEM);
+}
+
+static int run(mddev_t *mddev)
+{
+	raid5_conf_t *conf;
+	int working_disks = 0;
+	mdk_rdev_t *rdev;
+
+	if (mddev->reshape_position != MaxSector) {
+		/* Check that we can continue the reshape.
+		 * Currently only disks can change, it must
+		 * increase, and we must be past the point where
+		 * a stripe over-writes itself
+		 */
+		sector_t here_new, here_old;
+		int old_disks;
+		int max_degraded = (mddev->level == 6 ? 2 : 1);
+
+		if (mddev->new_level != mddev->level) {
+			printk(KERN_ERR "raid5: %s: unsupported reshape "
+			       "required - aborting.\n",
+			       mdname(mddev));
+			return -EINVAL;
+		}
+		old_disks = mddev->raid_disks - mddev->delta_disks;
+		/* reshape_position must be on a new-stripe boundary, and one
+		 * further up in new geometry must map after here in old
+		 * geometry.
+		 */
+		here_new = mddev->reshape_position;
+		if (sector_div(here_new, (mddev->new_chunk>>9)*
+			       (mddev->raid_disks - max_degraded))) {
+			printk(KERN_ERR "raid5: reshape_position not "
+			       "on a stripe boundary\n");
+			return -EINVAL;
+		}
+		/* here_new is the stripe we will write to */
+		here_old = mddev->reshape_position;
+		sector_div(here_old, (mddev->chunk_size>>9)*
+			   (old_disks-max_degraded));
+		/* here_old is the first stripe that we might need to read
+		 * from */
+		if (here_new >= here_old) {
+			/* Reading from the same stripe as writing to - bad */
+			printk(KERN_ERR "raid5: reshape_position too early for "
+			       "auto-recovery - aborting.\n");
+			return -EINVAL;
+		}
+		printk(KERN_INFO "raid5: reshape will continue\n");
+		/* OK, we should be able to continue; */
+	} else {
+		BUG_ON(mddev->level != mddev->new_level);
+		BUG_ON(mddev->layout != mddev->new_layout);
+		BUG_ON(mddev->chunk_size != mddev->new_chunk);
+		BUG_ON(mddev->delta_disks != 0);
 	}
+
+	if (mddev->private == NULL)
+		conf = setup_conf(mddev);
+	else
+		conf = mddev->private;
+
+	if (IS_ERR(conf))
+		return PTR_ERR(conf);
+
+	mddev->thread = conf->thread;
+	conf->thread = NULL;
+	mddev->private = conf;
+
+	/*
+	 * 0 for a fully functional array, 1 or 2 for a degraded array.
+	 */
+	list_for_each_entry(rdev, &mddev->disks, same_set)
+		if (rdev->raid_disk >= 0 &&
+		    test_bit(In_sync, &rdev->flags))
+			working_disks++;
+
+	mddev->degraded = conf->raid_disks - working_disks;
+
 	if (mddev->degraded > conf->max_degraded) {
 		printk(KERN_ERR "raid5: not enough operational devices for %s"
 			" (%d/%d failed)\n",
@@ -4169,6 +4526,10 @@ static int run(mddev_t *mddev)
 		goto abort;
 	}
 
+	/* device size must be a multiple of chunk size */
+	mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1);
+	mddev->resync_max_sectors = mddev->dev_sectors;
+
 	if (mddev->degraded > 0 &&
 	    mddev->recovery_cp != MaxSector) {
 		if (mddev->ok_start_degraded)
@@ -4184,43 +4545,22 @@ static int run(mddev_t *mddev)
 		}
 	}
 
-	{
-		mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5");
-		if (!mddev->thread) {
-			printk(KERN_ERR 
-				"raid5: couldn't allocate thread for %s\n",
-				mdname(mddev));
-			goto abort;
-		}
-	}
-	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
-		 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
-	if (grow_stripes(conf, conf->max_nr_stripes)) {
-		printk(KERN_ERR 
-			"raid5: couldn't allocate %dkB for buffers\n", memory);
-		shrink_stripes(conf);
-		md_unregister_thread(mddev->thread);
-		goto abort;
-	} else
-		printk(KERN_INFO "raid5: allocated %dkB for %s\n",
-			memory, mdname(mddev));
-
 	if (mddev->degraded == 0)
 		printk("raid5: raid level %d set %s active with %d out of %d"
-			" devices, algorithm %d\n", conf->level, mdname(mddev), 
-			mddev->raid_disks-mddev->degraded, mddev->raid_disks,
-			conf->algorithm);
+		       " devices, algorithm %d\n", conf->level, mdname(mddev),
+		       mddev->raid_disks-mddev->degraded, mddev->raid_disks,
+		       mddev->new_layout);
 	else
 		printk(KERN_ALERT "raid5: raid level %d set %s active with %d"
 			" out of %d devices, algorithm %d\n", conf->level,
 			mdname(mddev), mddev->raid_disks - mddev->degraded,
-			mddev->raid_disks, conf->algorithm);
+			mddev->raid_disks, mddev->new_layout);
 
 	print_raid5_conf(conf);
 
-	if (conf->expand_progress != MaxSector) {
+	if (conf->reshape_progress != MaxSector) {
 		printk("...ok start reshape thread\n");
-		conf->expand_lo = conf->expand_progress;
+		conf->reshape_safe = conf->reshape_progress;
 		atomic_set(&conf->reshape_stripes, 0);
 		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -4247,18 +4587,22 @@ static int run(mddev_t *mddev)
 		       "raid5: failed to create sysfs attributes for %s\n",
 		       mdname(mddev));
 
+	mddev->queue->queue_lock = &conf->device_lock;
+
 	mddev->queue->unplug_fn = raid5_unplug_device;
 	mddev->queue->backing_dev_info.congested_data = mddev;
 	mddev->queue->backing_dev_info.congested_fn = raid5_congested;
 
-	mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks -
-					    conf->max_degraded);
+	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
 
 	blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
 
 	return 0;
 abort:
+	md_unregister_thread(mddev->thread);
+	mddev->thread = NULL;
 	if (conf) {
+		shrink_stripes(conf);
 		print_raid5_conf(conf);
 		safe_put_page(conf->spare_page);
 		kfree(conf->disks);
@@ -4396,6 +4740,10 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
 	print_raid5_conf(conf);
 	rdev = p->rdev;
 	if (rdev) {
+		if (number >= conf->raid_disks &&
+		    conf->reshape_progress == MaxSector)
+			clear_bit(In_sync, &rdev->flags);
+
 		if (test_bit(In_sync, &rdev->flags) ||
 		    atomic_read(&rdev->nr_pending)) {
 			err = -EBUSY;
@@ -4405,7 +4753,8 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
 		 * isn't possible.
 		 */
 		if (!test_bit(Faulty, &rdev->flags) &&
-		    mddev->degraded <= conf->max_degraded) {
+		    mddev->degraded <= conf->max_degraded &&
+		    number < conf->raid_disks) {
 			err = -EBUSY;
 			goto abort;
 		}
@@ -4472,36 +4821,48 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
 	 * any io in the removed space completes, but it hardly seems
 	 * worth it.
 	 */
-	raid5_conf_t *conf = mddev_to_conf(mddev);
-
 	sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
-	mddev->array_sectors = sectors * (mddev->raid_disks
-					  - conf->max_degraded);
+	md_set_array_sectors(mddev, raid5_size(mddev, sectors,
+					       mddev->raid_disks));
+	if (mddev->array_sectors >
+	    raid5_size(mddev, sectors, mddev->raid_disks))
+		return -EINVAL;
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	mddev->changed = 1;
-	if (sectors/2  > mddev->size && mddev->recovery_cp == MaxSector) {
-		mddev->recovery_cp = mddev->size << 1;
+	if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
+		mddev->recovery_cp = mddev->dev_sectors;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}
-	mddev->size = sectors /2;
+	mddev->dev_sectors = sectors;
 	mddev->resync_max_sectors = sectors;
 	return 0;
 }
 
-#ifdef CONFIG_MD_RAID5_RESHAPE
 static int raid5_check_reshape(mddev_t *mddev)
 {
 	raid5_conf_t *conf = mddev_to_conf(mddev);
-	int err;
 
-	if (mddev->delta_disks < 0 ||
-	    mddev->new_level != mddev->level)
-		return -EINVAL; /* Cannot shrink array or change level yet */
-	if (mddev->delta_disks == 0)
-		return 0; /* nothing to do */
+	if (mddev->delta_disks == 0 &&
+	    mddev->new_layout == mddev->layout &&
+	    mddev->new_chunk == mddev->chunk_size)
+		return -EINVAL; /* nothing to do */
 	if (mddev->bitmap)
 		/* Cannot grow a bitmap yet */
 		return -EBUSY;
+	if (mddev->degraded > conf->max_degraded)
+		return -EINVAL;
+	if (mddev->delta_disks < 0) {
+		/* We might be able to shrink, but the devices must
+		 * be made bigger first.
+		 * For raid6, 4 is the minimum size.
+		 * Otherwise 2 is the minimum
+		 */
+		int min = 2;
+		if (mddev->level == 6)
+			min = 4;
+		if (mddev->raid_disks + mddev->delta_disks < min)
+			return -EINVAL;
+	}
 
 	/* Can only proceed if there are plenty of stripe_heads.
 	 * We need a minimum of one full stripe,, and for sensible progress
@@ -4514,18 +4875,12 @@ static int raid5_check_reshape(mddev_t *mddev)
 	if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes ||
 	    (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
 		printk(KERN_WARNING "raid5: reshape: not enough stripes.  Needed %lu\n",
-		       (mddev->chunk_size / STRIPE_SIZE)*4);
+		       (max(mddev->chunk_size, mddev->new_chunk)
+			/ STRIPE_SIZE)*4);
 		return -ENOSPC;
 	}
 
-	err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
-	if (err)
-		return err;
-
-	if (mddev->degraded > conf->max_degraded)
-		return -EINVAL;
-	/* looks like we might be able to manage this */
-	return 0;
+	return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
 }
 
 static int raid5_start_reshape(mddev_t *mddev)
@@ -4550,12 +4905,31 @@ static int raid5_start_reshape(mddev_t *mddev)
 		 */
 		return -EINVAL;
 
+	/* Refuse to reduce size of the array.  Any reductions in
+	 * array size must be through explicit setting of array_size
+	 * attribute.
+	 */
+	if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
+	    < mddev->array_sectors) {
+		printk(KERN_ERR "md: %s: array size must be reduced "
+		       "before number of disks\n", mdname(mddev));
+		return -EINVAL;
+	}
+
 	atomic_set(&conf->reshape_stripes, 0);
 	spin_lock_irq(&conf->device_lock);
 	conf->previous_raid_disks = conf->raid_disks;
 	conf->raid_disks += mddev->delta_disks;
-	conf->expand_progress = 0;
-	conf->expand_lo = 0;
+	conf->prev_chunk = conf->chunk_size;
+	conf->chunk_size = mddev->new_chunk;
+	conf->prev_algo = conf->algorithm;
+	conf->algorithm = mddev->new_layout;
+	if (mddev->delta_disks < 0)
+		conf->reshape_progress = raid5_size(mddev, 0, 0);
+	else
+		conf->reshape_progress = 0;
+	conf->reshape_safe = conf->reshape_progress;
+	conf->generation++;
 	spin_unlock_irq(&conf->device_lock);
 
 	/* Add some new drives, as many as will fit.
@@ -4580,9 +4954,12 @@ static int raid5_start_reshape(mddev_t *mddev)
 				break;
 		}
 
-	spin_lock_irqsave(&conf->device_lock, flags);
-	mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices;
-	spin_unlock_irqrestore(&conf->device_lock, flags);
+	if (mddev->delta_disks > 0) {
+		spin_lock_irqsave(&conf->device_lock, flags);
+		mddev->degraded = (conf->raid_disks - conf->previous_raid_disks)
+			- added_devices;
+		spin_unlock_irqrestore(&conf->device_lock, flags);
+	}
 	mddev->raid_disks = conf->raid_disks;
 	mddev->reshape_position = 0;
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -4597,52 +4974,86 @@ static int raid5_start_reshape(mddev_t *mddev)
 		mddev->recovery = 0;
 		spin_lock_irq(&conf->device_lock);
 		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
-		conf->expand_progress = MaxSector;
+		conf->reshape_progress = MaxSector;
 		spin_unlock_irq(&conf->device_lock);
 		return -EAGAIN;
 	}
+	conf->reshape_checkpoint = jiffies;
 	md_wakeup_thread(mddev->sync_thread);
 	md_new_event(mddev);
 	return 0;
 }
-#endif
 
+/* This is called from the reshape thread and should make any
+ * changes needed in 'conf'
+ */
 static void end_reshape(raid5_conf_t *conf)
 {
-	struct block_device *bdev;
 
 	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
-		conf->mddev->array_sectors = 2 * conf->mddev->size *
-			(conf->raid_disks - conf->max_degraded);
-		set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors);
-		conf->mddev->changed = 1;
-
-		bdev = bdget_disk(conf->mddev->gendisk, 0);
-		if (bdev) {
-			mutex_lock(&bdev->bd_inode->i_mutex);
-			i_size_write(bdev->bd_inode,
-				     (loff_t)conf->mddev->array_sectors << 9);
-			mutex_unlock(&bdev->bd_inode->i_mutex);
-			bdput(bdev);
-		}
+
 		spin_lock_irq(&conf->device_lock);
-		conf->expand_progress = MaxSector;
+		conf->previous_raid_disks = conf->raid_disks;
+		conf->reshape_progress = MaxSector;
 		spin_unlock_irq(&conf->device_lock);
-		conf->mddev->reshape_position = MaxSector;
+		wake_up(&conf->wait_for_overlap);
 
 		/* read-ahead size must cover two whole stripes, which is
 		 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
 		 */
 		{
-			int data_disks = conf->previous_raid_disks - conf->max_degraded;
-			int stripe = data_disks *
-				(conf->mddev->chunk_size / PAGE_SIZE);
+			int data_disks = conf->raid_disks - conf->max_degraded;
+			int stripe = data_disks * (conf->chunk_size
+						   / PAGE_SIZE);
 			if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
 				conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
 		}
 	}
 }
 
+/* This is called from the raid5d thread with mddev_lock held.
+ * It makes config changes to the device.
+ */
+static void raid5_finish_reshape(mddev_t *mddev)
+{
+	struct block_device *bdev;
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+
+	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+
+		if (mddev->delta_disks > 0) {
+			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
+			set_capacity(mddev->gendisk, mddev->array_sectors);
+			mddev->changed = 1;
+
+			bdev = bdget_disk(mddev->gendisk, 0);
+			if (bdev) {
+				mutex_lock(&bdev->bd_inode->i_mutex);
+				i_size_write(bdev->bd_inode,
+					     (loff_t)mddev->array_sectors << 9);
+				mutex_unlock(&bdev->bd_inode->i_mutex);
+				bdput(bdev);
+			}
+		} else {
+			int d;
+			mddev->degraded = conf->raid_disks;
+			for (d = 0; d < conf->raid_disks ; d++)
+				if (conf->disks[d].rdev &&
+				    test_bit(In_sync,
+					     &conf->disks[d].rdev->flags))
+					mddev->degraded--;
+			for (d = conf->raid_disks ;
+			     d < conf->raid_disks - mddev->delta_disks;
+			     d++)
+				raid5_remove_disk(mddev, d);
+		}
+		mddev->layout = conf->algorithm;
+		mddev->chunk_size = conf->chunk_size;
+		mddev->reshape_position = MaxSector;
+		mddev->delta_disks = 0;
+	}
+}
+
 static void raid5_quiesce(mddev_t *mddev, int state)
 {
 	raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -4672,6 +5083,212 @@ static void raid5_quiesce(mddev_t *mddev, int state)
 	}
 }
 
+
+static void *raid5_takeover_raid1(mddev_t *mddev)
+{
+	int chunksect;
+
+	if (mddev->raid_disks != 2 ||
+	    mddev->degraded > 1)
+		return ERR_PTR(-EINVAL);
+
+	/* Should check if there are write-behind devices? */
+
+	chunksect = 64*2; /* 64K by default */
+
+	/* The array must be an exact multiple of chunksize */
+	while (chunksect && (mddev->array_sectors & (chunksect-1)))
+		chunksect >>= 1;
+
+	if ((chunksect<<9) < STRIPE_SIZE)
+		/* array size does not allow a suitable chunk size */
+		return ERR_PTR(-EINVAL);
+
+	mddev->new_level = 5;
+	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
+	mddev->new_chunk = chunksect << 9;
+
+	return setup_conf(mddev);
+}
+
+static void *raid5_takeover_raid6(mddev_t *mddev)
+{
+	int new_layout;
+
+	switch (mddev->layout) {
+	case ALGORITHM_LEFT_ASYMMETRIC_6:
+		new_layout = ALGORITHM_LEFT_ASYMMETRIC;
+		break;
+	case ALGORITHM_RIGHT_ASYMMETRIC_6:
+		new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
+		break;
+	case ALGORITHM_LEFT_SYMMETRIC_6:
+		new_layout = ALGORITHM_LEFT_SYMMETRIC;
+		break;
+	case ALGORITHM_RIGHT_SYMMETRIC_6:
+		new_layout = ALGORITHM_RIGHT_SYMMETRIC;
+		break;
+	case ALGORITHM_PARITY_0_6:
+		new_layout = ALGORITHM_PARITY_0;
+		break;
+	case ALGORITHM_PARITY_N:
+		new_layout = ALGORITHM_PARITY_N;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+	mddev->new_level = 5;
+	mddev->new_layout = new_layout;
+	mddev->delta_disks = -1;
+	mddev->raid_disks -= 1;
+	return setup_conf(mddev);
+}
+
+
+static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
+{
+	/* For a 2-drive array, the layout and chunk size can be changed
+	 * immediately as not restriping is needed.
+	 * For larger arrays we record the new value - after validation
+	 * to be used by a reshape pass.
+	 */
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+
+	if (new_layout >= 0 && !algorithm_valid_raid5(new_layout))
+		return -EINVAL;
+	if (new_chunk > 0) {
+		if (new_chunk & (new_chunk-1))
+			/* not a power of 2 */
+			return -EINVAL;
+		if (new_chunk < PAGE_SIZE)
+			return -EINVAL;
+		if (mddev->array_sectors & ((new_chunk>>9)-1))
+			/* not factor of array size */
+			return -EINVAL;
+	}
+
+	/* They look valid */
+
+	if (mddev->raid_disks == 2) {
+
+		if (new_layout >= 0) {
+			conf->algorithm = new_layout;
+			mddev->layout = mddev->new_layout = new_layout;
+		}
+		if (new_chunk > 0) {
+			conf->chunk_size = new_chunk;
+			mddev->chunk_size = mddev->new_chunk = new_chunk;
+		}
+		set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		md_wakeup_thread(mddev->thread);
+	} else {
+		if (new_layout >= 0)
+			mddev->new_layout = new_layout;
+		if (new_chunk > 0)
+			mddev->new_chunk = new_chunk;
+	}
+	return 0;
+}
+
+static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
+{
+	if (new_layout >= 0 && !algorithm_valid_raid6(new_layout))
+		return -EINVAL;
+	if (new_chunk > 0) {
+		if (new_chunk & (new_chunk-1))
+			/* not a power of 2 */
+			return -EINVAL;
+		if (new_chunk < PAGE_SIZE)
+			return -EINVAL;
+		if (mddev->array_sectors & ((new_chunk>>9)-1))
+			/* not factor of array size */
+			return -EINVAL;
+	}
+
+	/* They look valid */
+
+	if (new_layout >= 0)
+		mddev->new_layout = new_layout;
+	if (new_chunk > 0)
+		mddev->new_chunk = new_chunk;
+
+	return 0;
+}
+
+static void *raid5_takeover(mddev_t *mddev)
+{
+	/* raid5 can take over:
+	 *  raid0 - if all devices are the same - make it a raid4 layout
+	 *  raid1 - if there are two drives.  We need to know the chunk size
+	 *  raid4 - trivial - just use a raid4 layout.
+	 *  raid6 - Providing it is a *_6 layout
+	 *
+	 * For now, just do raid1
+	 */
+
+	if (mddev->level == 1)
+		return raid5_takeover_raid1(mddev);
+	if (mddev->level == 4) {
+		mddev->new_layout = ALGORITHM_PARITY_N;
+		mddev->new_level = 5;
+		return setup_conf(mddev);
+	}
+	if (mddev->level == 6)
+		return raid5_takeover_raid6(mddev);
+
+	return ERR_PTR(-EINVAL);
+}
+
+
+static struct mdk_personality raid5_personality;
+
+static void *raid6_takeover(mddev_t *mddev)
+{
+	/* Currently can only take over a raid5.  We map the
+	 * personality to an equivalent raid6 personality
+	 * with the Q block at the end.
+	 */
+	int new_layout;
+
+	if (mddev->pers != &raid5_personality)
+		return ERR_PTR(-EINVAL);
+	if (mddev->degraded > 1)
+		return ERR_PTR(-EINVAL);
+	if (mddev->raid_disks > 253)
+		return ERR_PTR(-EINVAL);
+	if (mddev->raid_disks < 3)
+		return ERR_PTR(-EINVAL);
+
+	switch (mddev->layout) {
+	case ALGORITHM_LEFT_ASYMMETRIC:
+		new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
+		break;
+	case ALGORITHM_RIGHT_ASYMMETRIC:
+		new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
+		break;
+	case ALGORITHM_LEFT_SYMMETRIC:
+		new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
+		break;
+	case ALGORITHM_RIGHT_SYMMETRIC:
+		new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
+		break;
+	case ALGORITHM_PARITY_0:
+		new_layout = ALGORITHM_PARITY_0_6;
+		break;
+	case ALGORITHM_PARITY_N:
+		new_layout = ALGORITHM_PARITY_N;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+	mddev->new_level = 6;
+	mddev->new_layout = new_layout;
+	mddev->delta_disks = 1;
+	mddev->raid_disks += 1;
+	return setup_conf(mddev);
+}
+
+
 static struct mdk_personality raid6_personality =
 {
 	.name		= "raid6",
@@ -4687,11 +5304,13 @@ static struct mdk_personality raid6_personality =
 	.spare_active	= raid5_spare_active,
 	.sync_request	= sync_request,
 	.resize		= raid5_resize,
-#ifdef CONFIG_MD_RAID5_RESHAPE
+	.size		= raid5_size,
 	.check_reshape	= raid5_check_reshape,
 	.start_reshape  = raid5_start_reshape,
-#endif
+	.finish_reshape = raid5_finish_reshape,
 	.quiesce	= raid5_quiesce,
+	.takeover	= raid6_takeover,
+	.reconfig	= raid6_reconfig,
 };
 static struct mdk_personality raid5_personality =
 {
@@ -4708,11 +5327,13 @@ static struct mdk_personality raid5_personality =
 	.spare_active	= raid5_spare_active,
 	.sync_request	= sync_request,
 	.resize		= raid5_resize,
-#ifdef CONFIG_MD_RAID5_RESHAPE
+	.size		= raid5_size,
 	.check_reshape	= raid5_check_reshape,
 	.start_reshape  = raid5_start_reshape,
-#endif
+	.finish_reshape = raid5_finish_reshape,
 	.quiesce	= raid5_quiesce,
+	.takeover	= raid5_takeover,
+	.reconfig	= raid5_reconfig,
 };
 
 static struct mdk_personality raid4_personality =
@@ -4730,20 +5351,15 @@ static struct mdk_personality raid4_personality =
 	.spare_active	= raid5_spare_active,
 	.sync_request	= sync_request,
 	.resize		= raid5_resize,
-#ifdef CONFIG_MD_RAID5_RESHAPE
+	.size		= raid5_size,
 	.check_reshape	= raid5_check_reshape,
 	.start_reshape  = raid5_start_reshape,
-#endif
+	.finish_reshape = raid5_finish_reshape,
 	.quiesce	= raid5_quiesce,
 };
 
 static int __init raid5_init(void)
 {
-	int e;
-
-	e = raid6_select_algo();
-	if ( e )
-		return e;
 	register_md_personality(&raid6_personality);
 	register_md_personality(&raid5_personality);
 	register_md_personality(&raid4_personality);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
new file mode 100644
index 00000000000..52ba99954de
--- /dev/null
+++ b/drivers/md/raid5.h
@@ -0,0 +1,474 @@
+#ifndef _RAID5_H
+#define _RAID5_H
+
+#include <linux/raid/xor.h>
+
+/*
+ *
+ * Each stripe contains one buffer per disc.  Each buffer can be in
+ * one of a number of states stored in "flags".  Changes between
+ * these states happen *almost* exclusively under a per-stripe
+ * spinlock.  Some very specific changes can happen in bi_end_io, and
+ * these are not protected by the spin lock.
+ *
+ * The flag bits that are used to represent these states are:
+ *   R5_UPTODATE and R5_LOCKED
+ *
+ * State Empty == !UPTODATE, !LOCK
+ *        We have no data, and there is no active request
+ * State Want == !UPTODATE, LOCK
+ *        A read request is being submitted for this block
+ * State Dirty == UPTODATE, LOCK
+ *        Some new data is in this buffer, and it is being written out
+ * State Clean == UPTODATE, !LOCK
+ *        We have valid data which is the same as on disc
+ *
+ * The possible state transitions are:
+ *
+ *  Empty -> Want   - on read or write to get old data for  parity calc
+ *  Empty -> Dirty  - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE)
+ *  Empty -> Clean  - on compute_block when computing a block for failed drive
+ *  Want  -> Empty  - on failed read
+ *  Want  -> Clean  - on successful completion of read request
+ *  Dirty -> Clean  - on successful completion of write request
+ *  Dirty -> Clean  - on failed write
+ *  Clean -> Dirty  - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW)
+ *
+ * The Want->Empty, Want->Clean, Dirty->Clean, transitions
+ * all happen in b_end_io at interrupt time.
+ * Each sets the Uptodate bit before releasing the Lock bit.
+ * This leaves one multi-stage transition:
+ *    Want->Dirty->Clean
+ * This is safe because thinking that a Clean buffer is actually dirty
+ * will at worst delay some action, and the stripe will be scheduled
+ * for attention after the transition is complete.
+ *
+ * There is one possibility that is not covered by these states.  That
+ * is if one drive has failed and there is a spare being rebuilt.  We
+ * can't distinguish between a clean block that has been generated
+ * from parity calculations, and a clean block that has been
+ * successfully written to the spare ( or to parity when resyncing).
+ * To distingush these states we have a stripe bit STRIPE_INSYNC that
+ * is set whenever a write is scheduled to the spare, or to the parity
+ * disc if there is no spare.  A sync request clears this bit, and
+ * when we find it set with no buffers locked, we know the sync is
+ * complete.
+ *
+ * Buffers for the md device that arrive via make_request are attached
+ * to the appropriate stripe in one of two lists linked on b_reqnext.
+ * One list (bh_read) for read requests, one (bh_write) for write.
+ * There should never be more than one buffer on the two lists
+ * together, but we are not guaranteed of that so we allow for more.
+ *
+ * If a buffer is on the read list when the associated cache buffer is
+ * Uptodate, the data is copied into the read buffer and it's b_end_io
+ * routine is called.  This may happen in the end_request routine only
+ * if the buffer has just successfully been read.  end_request should
+ * remove the buffers from the list and then set the Uptodate bit on
+ * the buffer.  Other threads may do this only if they first check
+ * that the Uptodate bit is set.  Once they have checked that they may
+ * take buffers off the read queue.
+ *
+ * When a buffer on the write list is committed for write it is copied
+ * into the cache buffer, which is then marked dirty, and moved onto a
+ * third list, the written list (bh_written).  Once both the parity
+ * block and the cached buffer are successfully written, any buffer on
+ * a written list can be returned with b_end_io.
+ *
+ * The write list and read list both act as fifos.  The read list is
+ * protected by the device_lock.  The write and written lists are
+ * protected by the stripe lock.  The device_lock, which can be
+ * claimed while the stipe lock is held, is only for list
+ * manipulations and will only be held for a very short time.  It can
+ * be claimed from interrupts.
+ *
+ *
+ * Stripes in the stripe cache can be on one of two lists (or on
+ * neither).  The "inactive_list" contains stripes which are not
+ * currently being used for any request.  They can freely be reused
+ * for another stripe.  The "handle_list" contains stripes that need
+ * to be handled in some way.  Both of these are fifo queues.  Each
+ * stripe is also (potentially) linked to a hash bucket in the hash
+ * table so that it can be found by sector number.  Stripes that are
+ * not hashed must be on the inactive_list, and will normally be at
+ * the front.  All stripes start life this way.
+ *
+ * The inactive_list, handle_list and hash bucket lists are all protected by the
+ * device_lock.
+ *  - stripes on the inactive_list never have their stripe_lock held.
+ *  - stripes have a reference counter. If count==0, they are on a list.
+ *  - If a stripe might need handling, STRIPE_HANDLE is set.
+ *  - When refcount reaches zero, then if STRIPE_HANDLE it is put on
+ *    handle_list else inactive_list
+ *
+ * This, combined with the fact that STRIPE_HANDLE is only ever
+ * cleared while a stripe has a non-zero count means that if the
+ * refcount is 0 and STRIPE_HANDLE is set, then it is on the
+ * handle_list and if recount is 0 and STRIPE_HANDLE is not set, then
+ * the stripe is on inactive_list.
+ *
+ * The possible transitions are:
+ *  activate an unhashed/inactive stripe (get_active_stripe())
+ *     lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev
+ *  activate a hashed, possibly active stripe (get_active_stripe())
+ *     lockdev check-hash if(!cnt++)unlink-stripe unlockdev
+ *  attach a request to an active stripe (add_stripe_bh())
+ *     lockdev attach-buffer unlockdev
+ *  handle a stripe (handle_stripe())
+ *     lockstripe clrSTRIPE_HANDLE ...
+ *		(lockdev check-buffers unlockdev) ..
+ *		change-state ..
+ *		record io/ops needed unlockstripe schedule io/ops
+ *  release an active stripe (release_stripe())
+ *     lockdev if (!--cnt) { if  STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
+ *
+ * The refcount counts each thread that have activated the stripe,
+ * plus raid5d if it is handling it, plus one for each active request
+ * on a cached buffer, and plus one if the stripe is undergoing stripe
+ * operations.
+ *
+ * Stripe operations are performed outside the stripe lock,
+ * the stripe operations are:
+ * -copying data between the stripe cache and user application buffers
+ * -computing blocks to save a disk access, or to recover a missing block
+ * -updating the parity on a write operation (reconstruct write and
+ *  read-modify-write)
+ * -checking parity correctness
+ * -running i/o to disk
+ * These operations are carried out by raid5_run_ops which uses the async_tx
+ * api to (optionally) offload operations to dedicated hardware engines.
+ * When requesting an operation handle_stripe sets the pending bit for the
+ * operation and increments the count.  raid5_run_ops is then run whenever
+ * the count is non-zero.
+ * There are some critical dependencies between the operations that prevent some
+ * from being requested while another is in flight.
+ * 1/ Parity check operations destroy the in cache version of the parity block,
+ *    so we prevent parity dependent operations like writes and compute_blocks
+ *    from starting while a check is in progress.  Some dma engines can perform
+ *    the check without damaging the parity block, in these cases the parity
+ *    block is re-marked up to date (assuming the check was successful) and is
+ *    not re-read from disk.
+ * 2/ When a write operation is requested we immediately lock the affected
+ *    blocks, and mark them as not up to date.  This causes new read requests
+ *    to be held off, as well as parity checks and compute block operations.
+ * 3/ Once a compute block operation has been requested handle_stripe treats
+ *    that block as if it is up to date.  raid5_run_ops guaruntees that any
+ *    operation that is dependent on the compute block result is initiated after
+ *    the compute block completes.
+ */
+
+/*
+ * Operations state - intermediate states that are visible outside of sh->lock
+ * In general _idle indicates nothing is running, _run indicates a data
+ * processing operation is active, and _result means the data processing result
+ * is stable and can be acted upon.  For simple operations like biofill and
+ * compute that only have an _idle and _run state they are indicated with
+ * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN)
+ */
+/**
+ * enum check_states - handles syncing / repairing a stripe
+ * @check_state_idle - check operations are quiesced
+ * @check_state_run - check operation is running
+ * @check_state_result - set outside lock when check result is valid
+ * @check_state_compute_run - check failed and we are repairing
+ * @check_state_compute_result - set outside lock when compute result is valid
+ */
+enum check_states {
+	check_state_idle = 0,
+	check_state_run, /* parity check */
+	check_state_check_result,
+	check_state_compute_run, /* parity repair */
+	check_state_compute_result,
+};
+
+/**
+ * enum reconstruct_states - handles writing or expanding a stripe
+ */
+enum reconstruct_states {
+	reconstruct_state_idle = 0,
+	reconstruct_state_prexor_drain_run,	/* prexor-write */
+	reconstruct_state_drain_run,		/* write */
+	reconstruct_state_run,			/* expand */
+	reconstruct_state_prexor_drain_result,
+	reconstruct_state_drain_result,
+	reconstruct_state_result,
+};
+
+struct stripe_head {
+	struct hlist_node	hash;
+	struct list_head	lru;	      /* inactive_list or handle_list */
+	struct raid5_private_data *raid_conf;
+	short			generation;	/* increments with every
+						 * reshape */
+	sector_t		sector;		/* sector of this row */
+	short			pd_idx;		/* parity disk index */
+	short			qd_idx;		/* 'Q' disk index for raid6 */
+	short			ddf_layout;/* use DDF ordering to calculate Q */
+	unsigned long		state;		/* state flags */
+	atomic_t		count;	      /* nr of active thread/requests */
+	spinlock_t		lock;
+	int			bm_seq;	/* sequence number for bitmap flushes */
+	int			disks;		/* disks in stripe */
+	enum check_states	check_state;
+	enum reconstruct_states reconstruct_state;
+	/* stripe_operations
+	 * @target - STRIPE_OP_COMPUTE_BLK target
+	 */
+	struct stripe_operations {
+		int		   target;
+		u32		   zero_sum_result;
+	} ops;
+	struct r5dev {
+		struct bio	req;
+		struct bio_vec	vec;
+		struct page	*page;
+		struct bio	*toread, *read, *towrite, *written;
+		sector_t	sector;			/* sector of this page */
+		unsigned long	flags;
+	} dev[1]; /* allocated with extra space depending of RAID geometry */
+};
+
+/* stripe_head_state - collects and tracks the dynamic state of a stripe_head
+ *     for handle_stripe.  It is only valid under spin_lock(sh->lock);
+ */
+struct stripe_head_state {
+	int syncing, expanding, expanded;
+	int locked, uptodate, to_read, to_write, failed, written;
+	int to_fill, compute, req_compute, non_overwrite;
+	int failed_num;
+	unsigned long ops_request;
+};
+
+/* r6_state - extra state data only relevant to r6 */
+struct r6_state {
+	int p_failed, q_failed, failed_num[2];
+};
+
+/* Flags */
+#define	R5_UPTODATE	0	/* page contains current data */
+#define	R5_LOCKED	1	/* IO has been submitted on "req" */
+#define	R5_OVERWRITE	2	/* towrite covers whole page */
+/* and some that are internal to handle_stripe */
+#define	R5_Insync	3	/* rdev && rdev->in_sync at start */
+#define	R5_Wantread	4	/* want to schedule a read */
+#define	R5_Wantwrite	5
+#define	R5_Overlap	7	/* There is a pending overlapping request on this block */
+#define	R5_ReadError	8	/* seen a read error here recently */
+#define	R5_ReWrite	9	/* have tried to over-write the readerror */
+
+#define	R5_Expanded	10	/* This block now has post-expand data */
+#define	R5_Wantcompute	11 /* compute_block in progress treat as
+				    * uptodate
+				    */
+#define	R5_Wantfill	12 /* dev->toread contains a bio that needs
+				    * filling
+				    */
+#define R5_Wantdrain	13 /* dev->towrite needs to be drained */
+/*
+ * Write method
+ */
+#define RECONSTRUCT_WRITE	1
+#define READ_MODIFY_WRITE	2
+/* not a write method, but a compute_parity mode */
+#define	CHECK_PARITY		3
+/* Additional compute_parity mode -- updates the parity w/o LOCKING */
+#define UPDATE_PARITY		4
+
+/*
+ * Stripe state
+ */
+#define STRIPE_HANDLE		2
+#define	STRIPE_SYNCING		3
+#define	STRIPE_INSYNC		4
+#define	STRIPE_PREREAD_ACTIVE	5
+#define	STRIPE_DELAYED		6
+#define	STRIPE_DEGRADED		7
+#define	STRIPE_BIT_DELAY	8
+#define	STRIPE_EXPANDING	9
+#define	STRIPE_EXPAND_SOURCE	10
+#define	STRIPE_EXPAND_READY	11
+#define	STRIPE_IO_STARTED	12 /* do not count towards 'bypass_count' */
+#define	STRIPE_FULL_WRITE	13 /* all blocks are set to be overwritten */
+#define	STRIPE_BIOFILL_RUN	14
+#define	STRIPE_COMPUTE_RUN	15
+/*
+ * Operation request flags
+ */
+#define STRIPE_OP_BIOFILL	0
+#define STRIPE_OP_COMPUTE_BLK	1
+#define STRIPE_OP_PREXOR	2
+#define STRIPE_OP_BIODRAIN	3
+#define STRIPE_OP_POSTXOR	4
+#define STRIPE_OP_CHECK	5
+
+/*
+ * Plugging:
+ *
+ * To improve write throughput, we need to delay the handling of some
+ * stripes until there has been a chance that several write requests
+ * for the one stripe have all been collected.
+ * In particular, any write request that would require pre-reading
+ * is put on a "delayed" queue until there are no stripes currently
+ * in a pre-read phase.  Further, if the "delayed" queue is empty when
+ * a stripe is put on it then we "plug" the queue and do not process it
+ * until an unplug call is made. (the unplug_io_fn() is called).
+ *
+ * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
+ * it to the count of prereading stripes.
+ * When write is initiated, or the stripe refcnt == 0 (just in case) we
+ * clear the PREREAD_ACTIVE flag and decrement the count
+ * Whenever the 'handle' queue is empty and the device is not plugged, we
+ * move any strips from delayed to handle and clear the DELAYED flag and set
+ * PREREAD_ACTIVE.
+ * In stripe_handle, if we find pre-reading is necessary, we do it if
+ * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
+ * HANDLE gets cleared if stripe_handle leave nothing locked.
+ */
+
+
+struct disk_info {
+	mdk_rdev_t	*rdev;
+};
+
+struct raid5_private_data {
+	struct hlist_head	*stripe_hashtbl;
+	mddev_t			*mddev;
+	struct disk_info	*spare;
+	int			chunk_size, level, algorithm;
+	int			max_degraded;
+	int			raid_disks;
+	int			max_nr_stripes;
+
+	/* reshape_progress is the leading edge of a 'reshape'
+	 * It has value MaxSector when no reshape is happening
+	 * If delta_disks < 0, it is the last sector we started work on,
+	 * else is it the next sector to work on.
+	 */
+	sector_t		reshape_progress;
+	/* reshape_safe is the trailing edge of a reshape.  We know that
+	 * before (or after) this address, all reshape has completed.
+	 */
+	sector_t		reshape_safe;
+	int			previous_raid_disks;
+	int			prev_chunk, prev_algo;
+	short			generation; /* increments with every reshape */
+	unsigned long		reshape_checkpoint; /* Time we last updated
+						     * metadata */
+
+	struct list_head	handle_list; /* stripes needing handling */
+	struct list_head	hold_list; /* preread ready stripes */
+	struct list_head	delayed_list; /* stripes that have plugged requests */
+	struct list_head	bitmap_list; /* stripes delaying awaiting bitmap update */
+	struct bio		*retry_read_aligned; /* currently retrying aligned bios   */
+	struct bio		*retry_read_aligned_list; /* aligned bios retry list  */
+	atomic_t		preread_active_stripes; /* stripes with scheduled io */
+	atomic_t		active_aligned_reads;
+	atomic_t		pending_full_writes; /* full write backlog */
+	int			bypass_count; /* bypassed prereads */
+	int			bypass_threshold; /* preread nice */
+	struct list_head	*last_hold; /* detect hold_list promotions */
+
+	atomic_t		reshape_stripes; /* stripes with pending writes for reshape */
+	/* unfortunately we need two cache names as we temporarily have
+	 * two caches.
+	 */
+	int			active_name;
+	char			cache_name[2][20];
+	struct kmem_cache		*slab_cache; /* for allocating stripes */
+
+	int			seq_flush, seq_write;
+	int			quiesce;
+
+	int			fullsync;  /* set to 1 if a full sync is needed,
+					    * (fresh device added).
+					    * Cleared when a sync completes.
+					    */
+
+	struct page 		*spare_page; /* Used when checking P/Q in raid6 */
+
+	/*
+	 * Free stripes pool
+	 */
+	atomic_t		active_stripes;
+	struct list_head	inactive_list;
+	wait_queue_head_t	wait_for_stripe;
+	wait_queue_head_t	wait_for_overlap;
+	int			inactive_blocked;	/* release of inactive stripes blocked,
+							 * waiting for 25% to be free
+							 */
+	int			pool_size; /* number of disks in stripeheads in pool */
+	spinlock_t		device_lock;
+	struct disk_info	*disks;
+
+	/* When taking over an array from a different personality, we store
+	 * the new thread here until we fully activate the array.
+	 */
+	struct mdk_thread_s	*thread;
+};
+
+typedef struct raid5_private_data raid5_conf_t;
+
+#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
+
+/*
+ * Our supported algorithms
+ */
+#define ALGORITHM_LEFT_ASYMMETRIC	0 /* Rotating Parity N with Data Restart */
+#define ALGORITHM_RIGHT_ASYMMETRIC	1 /* Rotating Parity 0 with Data Restart */
+#define ALGORITHM_LEFT_SYMMETRIC	2 /* Rotating Parity N with Data Continuation */
+#define ALGORITHM_RIGHT_SYMMETRIC	3 /* Rotating Parity 0 with Data Continuation */
+
+/* Define non-rotating (raid4) algorithms.  These allow
+ * conversion of raid4 to raid5.
+ */
+#define ALGORITHM_PARITY_0		4 /* P or P,Q are initial devices */
+#define ALGORITHM_PARITY_N		5 /* P or P,Q are final devices. */
+
+/* DDF RAID6 layouts differ from md/raid6 layouts in two ways.
+ * Firstly, the exact positioning of the parity block is slightly
+ * different between the 'LEFT_*' modes of md and the "_N_*" modes
+ * of DDF.
+ * Secondly, or order of datablocks over which the Q syndrome is computed
+ * is different.
+ * Consequently we have different layouts for DDF/raid6 than md/raid6.
+ * These layouts are from the DDFv1.2 spec.
+ * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but
+ * leaves RLQ=3 as 'Vendor Specific'
+ */
+
+#define ALGORITHM_ROTATING_ZERO_RESTART	8 /* DDF PRL=6 RLQ=1 */
+#define ALGORITHM_ROTATING_N_RESTART	9 /* DDF PRL=6 RLQ=2 */
+#define ALGORITHM_ROTATING_N_CONTINUE	10 /*DDF PRL=6 RLQ=3 */
+
+
+/* For every RAID5 algorithm we define a RAID6 algorithm
+ * with exactly the same layout for data and parity, and
+ * with the Q block always on the last device (N-1).
+ * This allows trivial conversion from RAID5 to RAID6
+ */
+#define ALGORITHM_LEFT_ASYMMETRIC_6	16
+#define ALGORITHM_RIGHT_ASYMMETRIC_6	17
+#define ALGORITHM_LEFT_SYMMETRIC_6	18
+#define ALGORITHM_RIGHT_SYMMETRIC_6	19
+#define ALGORITHM_PARITY_0_6		20
+#define ALGORITHM_PARITY_N_6		ALGORITHM_PARITY_N
+
+static inline int algorithm_valid_raid5(int layout)
+{
+	return (layout >= 0) &&
+		(layout <= 5);
+}
+static inline int algorithm_valid_raid6(int layout)
+{
+	return (layout >= 0 && layout <= 5)
+		||
+		(layout == 8 || layout == 10)
+		||
+		(layout >= 16 && layout <= 20);
+}
+
+static inline int algorithm_is_DDF(int layout)
+{
+	return layout >= 8 && layout <= 10;
+}
+#endif
diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h
deleted file mode 100644
index 98dcde88470..00000000000
--- a/drivers/md/raid6.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/* -*- linux-c -*- ------------------------------------------------------- *
- *
- *   Copyright 2003 H. Peter Anvin - All Rights Reserved
- *
- *   This program is free software; you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- *   Bostom MA 02111-1307, USA; either version 2 of the License, or
- *   (at your option) any later version; incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
-
-#ifndef LINUX_RAID_RAID6_H
-#define LINUX_RAID_RAID6_H
-
-#ifdef __KERNEL__
-
-/* Set to 1 to use kernel-wide empty_zero_page */
-#define RAID6_USE_EMPTY_ZERO_PAGE 0
-
-#include <linux/raid/md.h>
-#include <linux/raid/raid5.h>
-
-typedef raid5_conf_t raid6_conf_t; /* Same configuration */
-
-/* Additional compute_parity mode -- updates the parity w/o LOCKING */
-#define UPDATE_PARITY	4
-
-/* We need a pre-zeroed page... if we don't want to use the kernel-provided
-   one define it here */
-#if RAID6_USE_EMPTY_ZERO_PAGE
-# define raid6_empty_zero_page empty_zero_page
-#else
-extern const char raid6_empty_zero_page[PAGE_SIZE];
-#endif
-
-#else /* ! __KERNEL__ */
-/* Used for testing in user space */
-
-#include <errno.h>
-#include <inttypes.h>
-#include <limits.h>
-#include <stddef.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-
-/* Not standard, but glibc defines it */
-#define BITS_PER_LONG __WORDSIZE
-
-typedef uint8_t  u8;
-typedef uint16_t u16;
-typedef uint32_t u32;
-typedef uint64_t u64;
-
-#ifndef PAGE_SIZE
-# define PAGE_SIZE 4096
-#endif
-extern const char raid6_empty_zero_page[PAGE_SIZE];
-
-#define __init
-#define __exit
-#define __attribute_const__ __attribute__((const))
-#define noinline __attribute__((noinline))
-
-#define preempt_enable()
-#define preempt_disable()
-#define cpu_has_feature(x) 1
-#define enable_kernel_altivec()
-#define disable_kernel_altivec()
-
-#endif /* __KERNEL__ */
-
-/* Routine choices */
-struct raid6_calls {
-	void (*gen_syndrome)(int, size_t, void **);
-	int  (*valid)(void);	/* Returns 1 if this routine set is usable */
-	const char *name;	/* Name of this routine set */
-	int prefer;		/* Has special performance attribute */
-};
-
-/* Selected algorithm */
-extern struct raid6_calls raid6_call;
-
-/* Algorithm list */
-extern const struct raid6_calls * const raid6_algos[];
-int raid6_select_algo(void);
-
-/* Return values from chk_syndrome */
-#define RAID6_OK	0
-#define RAID6_P_BAD	1
-#define RAID6_Q_BAD	2
-#define RAID6_PQ_BAD	3
-
-/* Galois field tables */
-extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256)));
-extern const u8 raid6_gfexp[256]      __attribute__((aligned(256)));
-extern const u8 raid6_gfinv[256]      __attribute__((aligned(256)));
-extern const u8 raid6_gfexi[256]      __attribute__((aligned(256)));
-
-/* Recovery routines */
-void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, void **ptrs);
-void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs);
-void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs);
-
-/* Some definitions to allow code to be compiled for testing in userspace */
-#ifndef __KERNEL__
-
-# define jiffies	raid6_jiffies()
-# define printk 	printf
-# define GFP_KERNEL	0
-# define __get_free_pages(x,y)	((unsigned long)mmap(NULL, PAGE_SIZE << (y), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0))
-# define free_pages(x,y)	munmap((void *)(x), (y)*PAGE_SIZE)
-
-static inline void cpu_relax(void)
-{
-	/* Nothing */
-}
-
-#undef  HZ
-#define HZ 1000
-static inline uint32_t raid6_jiffies(void)
-{
-	struct timeval tv;
-	gettimeofday(&tv, NULL);
-	return tv.tv_sec*1000 + tv.tv_usec/1000;
-}
-
-#endif /* ! __KERNEL__ */
-
-#endif /* LINUX_RAID_RAID6_H */
diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c
index 21987e3dbe6..866215ac7f2 100644
--- a/drivers/md/raid6algos.c
+++ b/drivers/md/raid6algos.c
@@ -5,7 +5,7 @@
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- *   Bostom MA 02111-1307, USA; either version 2 of the License, or
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
  *   (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
@@ -16,13 +16,20 @@
  * Algorithm list and algorithm selection for RAID-6
  */
 
-#include "raid6.h"
+#include <linux/raid/pq.h>
 #ifndef __KERNEL__
 #include <sys/mman.h>
 #include <stdio.h>
+#else
+#if !RAID6_USE_EMPTY_ZERO_PAGE
+/* In .bss so it's zeroed */
+const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
+EXPORT_SYMBOL(raid6_empty_zero_page);
+#endif
 #endif
 
 struct raid6_calls raid6_call;
+EXPORT_SYMBOL_GPL(raid6_call);
 
 /* Various routine sets */
 extern const struct raid6_calls raid6_intx1;
@@ -79,6 +86,7 @@ const struct raid6_calls * const raid6_algos[] = {
 #else
 /* Need more time to be stable in userspace */
 #define RAID6_TIME_JIFFIES_LG2	9
+#define time_before(x, y) ((x) < (y))
 #endif
 
 /* Try to pick the best algorithm */
@@ -152,3 +160,12 @@ int __init raid6_select_algo(void)
 
 	return best ? 0 : -EINVAL;
 }
+
+static void raid6_exit(void)
+{
+	do { } while (0);
+}
+
+subsys_initcall(raid6_select_algo);
+module_exit(raid6_exit);
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc
index b9afd35b881..699dfeee494 100644
--- a/drivers/md/raid6altivec.uc
+++ b/drivers/md/raid6altivec.uc
@@ -5,7 +5,7 @@
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- *   Bostom MA 02111-1307, USA; either version 2 of the License, or
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
  *   (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
@@ -22,7 +22,7 @@
  * bracked this with preempt_disable/enable or in a lock)
  */
 
-#include "raid6.h"
+#include <linux/raid/pq.h>
 
 #ifdef CONFIG_ALTIVEC
 
diff --git a/drivers/md/raid6int.uc b/drivers/md/raid6int.uc
index ad004cee0e2..f9bf9cba357 100644
--- a/drivers/md/raid6int.uc
+++ b/drivers/md/raid6int.uc
@@ -5,7 +5,7 @@
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- *   Bostom MA 02111-1307, USA; either version 2 of the License, or
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
  *   (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
  * This file is postprocessed using unroll.pl
  */
 
-#include "raid6.h"
+#include <linux/raid/pq.h>
 
 /*
  * This is the C data type to use
diff --git a/drivers/md/raid6mmx.c b/drivers/md/raid6mmx.c
index d4e4a1bd70a..e7f6c13132b 100644
--- a/drivers/md/raid6mmx.c
+++ b/drivers/md/raid6mmx.c
@@ -5,7 +5,7 @@
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- *   Bostom MA 02111-1307, USA; either version 2 of the License, or
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
  *   (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
 
 #if defined(__i386__) && !defined(__arch_um__)
 
-#include "raid6.h"
+#include <linux/raid/pq.h>
 #include "raid6x86.h"
 
 /* Shared with raid6sse1.c */
diff --git a/drivers/md/raid6recov.c b/drivers/md/raid6recov.c
index a8c4d9451bd..2609f00e0d6 100644
--- a/drivers/md/raid6recov.c
+++ b/drivers/md/raid6recov.c
@@ -5,7 +5,7 @@
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- *   Bostom MA 02111-1307, USA; either version 2 of the License, or
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
  *   (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
  * the syndrome.)
  */
 
-#include "raid6.h"
+#include <linux/raid/pq.h>
 
 /* Recover two failed data blocks. */
 void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
@@ -63,9 +63,7 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
 		p++; q++;
 	}
 }
-
-
-
+EXPORT_SYMBOL_GPL(raid6_2data_recov);
 
 /* Recover failure of one data block plus the P block */
 void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
@@ -97,9 +95,10 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
 		q++; dq++;
 	}
 }
+EXPORT_SYMBOL_GPL(raid6_datap_recov);
 
-
-#ifndef __KERNEL__		/* Testing only */
+#ifndef __KERNEL__
+/* Testing only */
 
 /* Recover two failed blocks. */
 void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs)
diff --git a/drivers/md/raid6sse1.c b/drivers/md/raid6sse1.c
index 0666237276f..b274dd5eab8 100644
--- a/drivers/md/raid6sse1.c
+++ b/drivers/md/raid6sse1.c
@@ -5,7 +5,7 @@
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- *   Bostom MA 02111-1307, USA; either version 2 of the License, or
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
  *   (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
@@ -23,7 +23,7 @@
 
 #if defined(__i386__) && !defined(__arch_um__)
 
-#include "raid6.h"
+#include <linux/raid/pq.h>
 #include "raid6x86.h"
 
 /* Defined in raid6mmx.c */
diff --git a/drivers/md/raid6sse2.c b/drivers/md/raid6sse2.c
index b034ad86803..6ed6c6c0389 100644
--- a/drivers/md/raid6sse2.c
+++ b/drivers/md/raid6sse2.c
@@ -5,7 +5,7 @@
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- *   Bostom MA 02111-1307, USA; either version 2 of the License, or
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
  *   (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
@@ -19,7 +19,7 @@
 
 #if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
 
-#include "raid6.h"
+#include <linux/raid/pq.h>
 #include "raid6x86.h"
 
 static const struct raid6_sse_constants {
diff --git a/drivers/md/raid6test/Makefile b/drivers/md/raid6test/Makefile
index 78e0396adf2..58ffdf4f516 100644
--- a/drivers/md/raid6test/Makefile
+++ b/drivers/md/raid6test/Makefile
@@ -5,7 +5,7 @@
 
 CC	 = gcc
 OPTFLAGS = -O2			# Adjust as desired
-CFLAGS	 = -I.. -g $(OPTFLAGS)
+CFLAGS	 = -I.. -I ../../../include -g $(OPTFLAGS)
 LD	 = ld
 PERL	 = perl
 AR	 = ar
diff --git a/drivers/md/raid6test/test.c b/drivers/md/raid6test/test.c
index 559cc41b258..7a930318b17 100644
--- a/drivers/md/raid6test/test.c
+++ b/drivers/md/raid6test/test.c
@@ -17,7 +17,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
-#include "raid6.h"
+#include <linux/raid/pq.h>
 
 #define NDISKS		16	/* Including P and Q */
 
diff --git a/drivers/md/raid6x86.h b/drivers/md/raid6x86.h
index 99fea7a70ca..4c22c156855 100644
--- a/drivers/md/raid6x86.h
+++ b/drivers/md/raid6x86.h
@@ -5,7 +5,7 @@
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- *   Bostom MA 02111-1307, USA; either version 2 of the License, or
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
  *   (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
diff --git a/drivers/mfd/twl4030-core.c b/drivers/mfd/twl4030-core.c
index 68826f1e36b..ec90e953adc 100644
--- a/drivers/mfd/twl4030-core.c
+++ b/drivers/mfd/twl4030-core.c
@@ -592,11 +592,9 @@ add_children(struct twl4030_platform_data *pdata, unsigned long features)
 
 	/* maybe add LDOs that are omitted on cost-reduced parts */
 	if (twl_has_regulator() && !(features & TPS_SUBSET)) {
-		/*
 		child = add_regulator(TWL4030_REG_VPLL2, pdata->vpll2);
 		if (IS_ERR(child))
 			return PTR_ERR(child);
-		*/
 
 		child = add_regulator(TWL4030_REG_VMMC2, pdata->vmmc2);
 		if (IS_ERR(child))
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index df6ce4a06cf..1445ea8f10a 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -21,6 +21,7 @@
 #include <linux/leds.h>
 #include <linux/scatterlist.h>
 #include <linux/log2.h>
+#include <linux/regulator/consumer.h>
 
 #include <linux/mmc/card.h>
 #include <linux/mmc/host.h>
@@ -523,6 +524,105 @@ u32 mmc_vddrange_to_ocrmask(int vdd_min, int vdd_max)
 }
 EXPORT_SYMBOL(mmc_vddrange_to_ocrmask);
 
+#ifdef CONFIG_REGULATOR
+
+/**
+ * mmc_regulator_get_ocrmask - return mask of supported voltages
+ * @supply: regulator to use
+ *
+ * This returns either a negative errno, or a mask of voltages that
+ * can be provided to MMC/SD/SDIO devices using the specified voltage
+ * regulator.  This would normally be called before registering the
+ * MMC host adapter.
+ */
+int mmc_regulator_get_ocrmask(struct regulator *supply)
+{
+	int			result = 0;
+	int			count;
+	int			i;
+
+	count = regulator_count_voltages(supply);
+	if (count < 0)
+		return count;
+
+	for (i = 0; i < count; i++) {
+		int		vdd_uV;
+		int		vdd_mV;
+
+		vdd_uV = regulator_list_voltage(supply, i);
+		if (vdd_uV <= 0)
+			continue;
+
+		vdd_mV = vdd_uV / 1000;
+		result |= mmc_vddrange_to_ocrmask(vdd_mV, vdd_mV);
+	}
+
+	return result;
+}
+EXPORT_SYMBOL(mmc_regulator_get_ocrmask);
+
+/**
+ * mmc_regulator_set_ocr - set regulator to match host->ios voltage
+ * @vdd_bit: zero for power off, else a bit number (host->ios.vdd)
+ * @supply: regulator to use
+ *
+ * Returns zero on success, else negative errno.
+ *
+ * MMC host drivers may use this to enable or disable a regulator using
+ * a particular supply voltage.  This would normally be called from the
+ * set_ios() method.
+ */
+int mmc_regulator_set_ocr(struct regulator *supply, unsigned short vdd_bit)
+{
+	int			result = 0;
+	int			min_uV, max_uV;
+	int			enabled;
+
+	enabled = regulator_is_enabled(supply);
+	if (enabled < 0)
+		return enabled;
+
+	if (vdd_bit) {
+		int		tmp;
+		int		voltage;
+
+		/* REVISIT mmc_vddrange_to_ocrmask() may have set some
+		 * bits this regulator doesn't quite support ... don't
+		 * be too picky, most cards and regulators are OK with
+		 * a 0.1V range goof (it's a small error percentage).
+		 */
+		tmp = vdd_bit - ilog2(MMC_VDD_165_195);
+		if (tmp == 0) {
+			min_uV = 1650 * 1000;
+			max_uV = 1950 * 1000;
+		} else {
+			min_uV = 1900 * 1000 + tmp * 100 * 1000;
+			max_uV = min_uV + 100 * 1000;
+		}
+
+		/* avoid needless changes to this voltage; the regulator
+		 * might not allow this operation
+		 */
+		voltage = regulator_get_voltage(supply);
+		if (voltage < 0)
+			result = voltage;
+		else if (voltage < min_uV || voltage > max_uV)
+			result = regulator_set_voltage(supply, min_uV, max_uV);
+		else
+			result = 0;
+
+		if (result == 0 && !enabled)
+			result = regulator_enable(supply);
+	} else if (enabled) {
+		result = regulator_disable(supply);
+	}
+
+	return result;
+}
+EXPORT_SYMBOL(mmc_regulator_set_ocr);
+
+#endif
+
 /*
  * Mask off any voltages we don't support and select
  * the lowest voltage
diff --git a/drivers/mtd/maps/pxa2xx-flash.c b/drivers/mtd/maps/pxa2xx-flash.c
index e9026cb1c5b..572d32fdf38 100644
--- a/drivers/mtd/maps/pxa2xx-flash.c
+++ b/drivers/mtd/maps/pxa2xx-flash.c
@@ -117,7 +117,7 @@ static int __init pxa2xx_flash_probe(struct platform_device *pdev)
 	return 0;
 }
 
-static int __exit pxa2xx_flash_remove(struct platform_device *dev)
+static int __devexit pxa2xx_flash_remove(struct platform_device *dev)
 {
 	struct pxa2xx_flash_info *info = platform_get_drvdata(dev);
 
diff --git a/drivers/parisc/asp.c b/drivers/parisc/asp.c
index 7931133526c..9ca21098b14 100644
--- a/drivers/parisc/asp.c
+++ b/drivers/parisc/asp.c
@@ -81,7 +81,7 @@ static int __init asp_init_chip(struct parisc_device *dev)
 	asp.hpa = ASP_INTERRUPT_ADDR;
 
 	printk(KERN_INFO "%s version %d at 0x%lx found.\n", 
-		asp.name, asp.version, dev->hpa.start);
+		asp.name, asp.version, (unsigned long)dev->hpa.start);
 
 	/* the IRQ ASP should use */
 	ret = -EBUSY;
diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c
index cd4dd7ed2c0..5d610cbcfe8 100644
--- a/drivers/parisc/ccio-dma.c
+++ b/drivers/parisc/ccio-dma.c
@@ -406,8 +406,6 @@ resource_found:
 	}
 	ioc->avg_search[ioc->avg_idx++] = cr_start;
 	ioc->avg_idx &= CCIO_SEARCH_SAMPLE - 1;
-#endif
-#ifdef CCIO_COLLECT_STATS
 	ioc->used_pages += pages_needed;
 #endif
 	/* 
@@ -453,10 +451,10 @@ ccio_free_range(struct ioc *ioc, dma_addr_t iova, unsigned long pages_mapped)
 		unsigned long mask = ~(~0UL >> pages_mapped);
 		CCIO_FREE_MAPPINGS(ioc, res_idx, mask, 8);
 #else
-		CCIO_FREE_MAPPINGS(ioc, res_idx, 0xff, 8);
+		CCIO_FREE_MAPPINGS(ioc, res_idx, 0xffUL, 8);
 #endif
 	} else if(pages_mapped <= 16) {
-		CCIO_FREE_MAPPINGS(ioc, res_idx, 0xffff, 16);
+		CCIO_FREE_MAPPINGS(ioc, res_idx, 0xffffUL, 16);
 	} else if(pages_mapped <= 32) {
 		CCIO_FREE_MAPPINGS(ioc, res_idx, ~(unsigned int)0, 32);
 #ifdef __LP64__
@@ -1028,8 +1026,10 @@ static int ccio_proc_info(struct seq_file *m, void *p)
 
 	while (ioc != NULL) {
 		unsigned int total_pages = ioc->res_size << 3;
+#ifdef CCIO_COLLECT_STATS
 		unsigned long avg = 0, min, max;
 		int j;
+#endif
 
 		len += seq_printf(m, "%s\n", ioc->name);
 		
@@ -1060,8 +1060,7 @@ static int ccio_proc_info(struct seq_file *m, void *p)
 		avg /= CCIO_SEARCH_SAMPLE;
 		len += seq_printf(m, "  Bitmap search : %ld/%ld/%ld (min/avg/max CPU Cycles)\n",
 				  min, avg, max);
-#endif
-#ifdef CCIO_COLLECT_STATS
+
 		len += seq_printf(m, "pci_map_single(): %8ld calls  %8ld pages (avg %d/1000)\n",
 				  ioc->msingle_calls, ioc->msingle_pages,
 				  (int)((ioc->msingle_pages * 1000)/ioc->msingle_calls));
@@ -1400,7 +1399,7 @@ ccio_init_resource(struct resource *res, char *name, void __iomem *ioaddr)
 	result = insert_resource(&iomem_resource, res);
 	if (result < 0) {
 		printk(KERN_ERR "%s() failed to claim CCIO bus address space (%08lx,%08lx)\n", 
-			__func__, res->start, res->end);
+			__func__, (unsigned long)res->start, (unsigned long)res->end);
 	}
 }
 
@@ -1551,7 +1550,8 @@ static int __init ccio_probe(struct parisc_device *dev)
 
 	ioc->name = dev->id.hversion == U2_IOA_RUNWAY ? "U2" : "UTurn";
 
-	printk(KERN_INFO "Found %s at 0x%lx\n", ioc->name, dev->hpa.start);
+	printk(KERN_INFO "Found %s at 0x%lx\n", ioc->name,
+		(unsigned long)dev->hpa.start);
 
 	for (i = 0; i < ioc_count; i++) {
 		ioc_p = &(*ioc_p)->next;
diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c
index bb5a1c9597c..52ae0b1d470 100644
--- a/drivers/parisc/dino.c
+++ b/drivers/parisc/dino.c
@@ -819,7 +819,9 @@ dino_bridge_init(struct dino_device *dino_dev, const char *name)
 
 		result = ccio_request_resource(dino_dev->hba.dev, &res[i]);
 		if (result < 0) {
-			printk(KERN_ERR "%s: failed to claim PCI Bus address space %d (0x%lx-0x%lx)!\n", name, i, res[i].start, res[i].end);
+			printk(KERN_ERR "%s: failed to claim PCI Bus address "
+			       "space %d (0x%lx-0x%lx)!\n", name, i,
+			       (unsigned long)res[i].start, (unsigned long)res[i].end);
 			return result;
 		}
 	}
@@ -899,7 +901,8 @@ static int __init dino_common_init(struct parisc_device *dev,
 	if (request_resource(&ioport_resource, res) < 0) {
 		printk(KERN_ERR "%s: request I/O Port region failed "
 		       "0x%lx/%lx (hpa 0x%p)\n",
-		       name, res->start, res->end, dino_dev->hba.base_addr);
+		       name, (unsigned long)res->start, (unsigned long)res->end,
+		       dino_dev->hba.base_addr);
 		return 1;
 	}
 
diff --git a/drivers/parisc/eisa.c b/drivers/parisc/eisa.c
index 7891db50c48..f415fdd9a88 100644
--- a/drivers/parisc/eisa.c
+++ b/drivers/parisc/eisa.c
@@ -314,7 +314,7 @@ static int __init eisa_probe(struct parisc_device *dev)
 	char *name = is_mongoose(dev) ? "Mongoose" : "Wax";
 
 	printk(KERN_INFO "%s EISA Adapter found at 0x%08lx\n", 
-		name, dev->hpa.start);
+		name, (unsigned long)dev->hpa.start);
 
 	eisa_dev.hba.dev = dev;
 	eisa_dev.hba.iommu = ccio_get_iommu(dev);
diff --git a/drivers/parisc/eisa_enumerator.c b/drivers/parisc/eisa_enumerator.c
index 6d8aae003f6..c709ecc2b7f 100644
--- a/drivers/parisc/eisa_enumerator.c
+++ b/drivers/parisc/eisa_enumerator.c
@@ -98,7 +98,7 @@ static int configure_memory(const unsigned char *buf,
 			res->start = mem_parent->start + get_24(buf+len+2);
 			res->end = res->start + get_16(buf+len+5)*1024;
 			res->flags = IORESOURCE_MEM;
-			printk("memory %lx-%lx ", res->start, res->end);
+			printk("memory %lx-%lx ", (unsigned long)res->start, (unsigned long)res->end);
 			result = request_resource(mem_parent, res);
 			if (result < 0) {
 				printk("\n" KERN_ERR "EISA Enumerator: failed to claim EISA Bus address space!\n");
@@ -188,7 +188,7 @@ static int configure_port(const unsigned char *buf, struct resource *io_parent,
 			res->start = get_16(buf+len+1);
 			res->end = get_16(buf+len+1)+(c&HPEE_PORT_SIZE_MASK)+1;
 			res->flags = IORESOURCE_IO;
-			printk("ioports %lx-%lx ", res->start, res->end);
+			printk("ioports %lx-%lx ", (unsigned long)res->start, (unsigned long)res->end);
 			result = request_resource(io_parent, res);
 			if (result < 0) {
 				printk("\n" KERN_ERR "EISA Enumerator: failed to claim EISA Bus address space!\n");
diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c
index 501aaf1f253..73348c4047e 100644
--- a/drivers/parisc/iosapic.c
+++ b/drivers/parisc/iosapic.c
@@ -714,7 +714,7 @@ static void iosapic_set_affinity_irq(unsigned int irq,
 	if (dest_cpu < 0)
 		return;
 
-	irq_desc[irq].affinity = cpumask_of_cpu(dest_cpu);
+	cpumask_copy(irq_desc[irq].affinity, cpumask_of(dest_cpu));
 	vi->txn_addr = txn_affinity_addr(irq, dest_cpu);
 
 	spin_lock_irqsave(&iosapic_lock, flags);
diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c
index 454b6532e40..9581d361945 100644
--- a/drivers/parisc/led.c
+++ b/drivers/parisc/led.c
@@ -3,7 +3,7 @@
  *
  *      (c) Copyright 2000 Red Hat Software
  *      (c) Copyright 2000 Helge Deller <hdeller@redhat.com>
- *      (c) Copyright 2001-2005 Helge Deller <deller@gmx.de>
+ *      (c) Copyright 2001-2009 Helge Deller <deller@gmx.de>
  *      (c) Copyright 2001 Randolph Chung <tausq@debian.org>
  *
  *      This program is free software; you can redistribute it and/or modify
@@ -243,13 +243,11 @@ static int __init led_create_procfs(void)
 
 	proc_pdc_root = proc_mkdir("pdc", 0);
 	if (!proc_pdc_root) return -1;
-	proc_pdc_root->owner = THIS_MODULE;
 	ent = create_proc_entry("led", S_IFREG|S_IRUGO|S_IWUSR, proc_pdc_root);
 	if (!ent) return -1;
 	ent->data = (void *)LED_NOLCD; /* LED */
 	ent->read_proc = led_proc_read;
 	ent->write_proc = led_proc_write;
-	ent->owner = THIS_MODULE;
 
 	if (led_type == LED_HASLCD)
 	{
@@ -258,7 +256,6 @@ static int __init led_create_procfs(void)
 		ent->data = (void *)LED_HASLCD; /* LCD */
 		ent->read_proc = led_proc_read;
 		ent->write_proc = led_proc_write;
-		ent->owner = THIS_MODULE;
 	}
 
 	return 0;
@@ -463,9 +460,20 @@ static void led_work_func (struct work_struct *unused)
 	if (likely(led_lanrxtx))  currentleds |= led_get_net_activity();
 	if (likely(led_diskio))   currentleds |= led_get_diskio_activity();
 
-	/* blink all LEDs twice a second if we got an Oops (HPMC) */
-	if (unlikely(oops_in_progress)) 
-		currentleds = (count_HZ<=(HZ/2)) ? 0 : 0xff;
+	/* blink LEDs if we got an Oops (HPMC) */
+	if (unlikely(oops_in_progress)) {
+		if (boot_cpu_data.cpu_type >= pcxl2) {
+			/* newer machines don't have loadavg. LEDs, so we
+			 * let all LEDs blink twice per second instead */
+			currentleds = (count_HZ <= (HZ/2)) ? 0 : 0xff;
+		} else {
+			/* old machines: blink loadavg. LEDs twice per second */
+			if (count_HZ <= (HZ/2))
+				currentleds &= ~(LED4|LED5|LED6|LED7);
+			else
+				currentleds |= (LED4|LED5|LED6|LED7);
+		}
+	}
 
 	if (currentleds != lastleds)
 	{
@@ -511,7 +519,7 @@ static int led_halt(struct notifier_block *nb, unsigned long event, void *buf)
 	
 	/* Cancel the work item and delete the queue */
 	if (led_wq) {
-		cancel_rearming_delayed_workqueue(led_wq, &led_task);
+		cancel_delayed_work_sync(&led_task);
 		destroy_workqueue(led_wq);
 		led_wq = NULL;
 	}
@@ -630,7 +638,7 @@ int lcd_print( const char *str )
 	
 	/* temporarily disable the led work task */
 	if (led_wq)
-		cancel_rearming_delayed_workqueue(led_wq, &led_task);
+		cancel_delayed_work_sync(&led_task);
 
 	/* copy display string to buffer for procfs */
 	strlcpy(lcd_text, str, sizeof(lcd_text));
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 9dbd5066aca..23e56a564e0 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -164,7 +164,8 @@ static inline void context_clear_entry(struct context_entry *context)
  * 1: writable
  * 2-6: reserved
  * 7: super page
- * 8-11: available
+ * 8-10: available
+ * 11: snoop behavior
  * 12-63: Host physcial address
  */
 struct dma_pte {
@@ -186,6 +187,11 @@ static inline void dma_set_pte_writable(struct dma_pte *pte)
 	pte->val |= DMA_PTE_WRITE;
 }
 
+static inline void dma_set_pte_snp(struct dma_pte *pte)
+{
+	pte->val |= DMA_PTE_SNP;
+}
+
 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
 {
 	pte->val = (pte->val & ~3) | (prot & 3);
@@ -231,6 +237,7 @@ struct dmar_domain {
 	int		flags;		/* flags to find out type of domain */
 
 	int		iommu_coherency;/* indicate coherency of iommu access */
+	int		iommu_snooping; /* indicate snooping control feature*/
 	int		iommu_count;	/* reference count of iommu */
 	spinlock_t	iommu_lock;	/* protect iommu set in domain */
 	u64		max_addr;	/* maximum mapped address */
@@ -421,7 +428,6 @@ static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 	return g_iommus[iommu_id];
 }
 
-/* "Coherency" capability may be different across iommus */
 static void domain_update_iommu_coherency(struct dmar_domain *domain)
 {
 	int i;
@@ -438,6 +444,29 @@ static void domain_update_iommu_coherency(struct dmar_domain *domain)
 	}
 }
 
+static void domain_update_iommu_snooping(struct dmar_domain *domain)
+{
+	int i;
+
+	domain->iommu_snooping = 1;
+
+	i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
+	for (; i < g_num_of_iommus; ) {
+		if (!ecap_sc_support(g_iommus[i]->ecap)) {
+			domain->iommu_snooping = 0;
+			break;
+		}
+		i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
+	}
+}
+
+/* Some capabilities may be different across iommus */
+static void domain_update_iommu_cap(struct dmar_domain *domain)
+{
+	domain_update_iommu_coherency(domain);
+	domain_update_iommu_snooping(domain);
+}
+
 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
 {
 	struct dmar_drhd_unit *drhd = NULL;
@@ -689,15 +718,17 @@ static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
 {
 	int addr_width = agaw_to_width(domain->agaw);
+	int npages;
 
 	start &= (((u64)1) << addr_width) - 1;
 	end &= (((u64)1) << addr_width) - 1;
 	/* in case it's partial page */
 	start = PAGE_ALIGN(start);
 	end &= PAGE_MASK;
+	npages = (end - start) / VTD_PAGE_SIZE;
 
 	/* we don't need lock here, nobody else touches the iova range */
-	while (start < end) {
+	while (npages--) {
 		dma_pte_clear_one(domain, start);
 		start += VTD_PAGE_SIZE;
 	}
@@ -1241,6 +1272,11 @@ static int domain_init(struct dmar_domain *domain, int guest_width)
 	else
 		domain->iommu_coherency = 0;
 
+	if (ecap_sc_support(iommu->ecap))
+		domain->iommu_snooping = 1;
+	else
+		domain->iommu_snooping = 0;
+
 	domain->iommu_count = 1;
 
 	/* always allocate the top pgd */
@@ -1369,7 +1405,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
 	spin_lock_irqsave(&domain->iommu_lock, flags);
 	if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
 		domain->iommu_count++;
-		domain_update_iommu_coherency(domain);
+		domain_update_iommu_cap(domain);
 	}
 	spin_unlock_irqrestore(&domain->iommu_lock, flags);
 	return 0;
@@ -1469,6 +1505,8 @@ domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
 		BUG_ON(dma_pte_addr(pte));
 		dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
 		dma_set_pte_prot(pte, prot);
+		if (prot & DMA_PTE_SNP)
+			dma_set_pte_snp(pte);
 		domain_flush_cache(domain, pte, sizeof(*pte));
 		start_pfn++;
 		index++;
@@ -2119,7 +2157,7 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
 error:
 	if (iova)
 		__free_iova(&domain->iovad, iova);
-	printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
+	printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
 		pci_name(pdev), size, (unsigned long long)paddr, dir);
 	return 0;
 }
@@ -2218,7 +2256,7 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
 	start_addr = iova->pfn_lo << PAGE_SHIFT;
 	size = aligned_size((u64)dev_addr, size);
 
-	pr_debug("Device %s unmapping: %lx@%llx\n",
+	pr_debug("Device %s unmapping: %zx@%llx\n",
 		pci_name(pdev), size, (unsigned long long)start_addr);
 
 	/*  clear the whole page */
@@ -2282,8 +2320,6 @@ static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
 	free_pages((unsigned long)vaddr, order);
 }
 
-#define SG_ENT_VIRT_ADDRESS(sg)	(sg_virt((sg)))
-
 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
 			   int nelems, enum dma_data_direction dir,
 			   struct dma_attrs *attrs)
@@ -2294,7 +2330,7 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
 	unsigned long start_addr;
 	struct iova *iova;
 	size_t size = 0;
-	void *addr;
+	phys_addr_t addr;
 	struct scatterlist *sg;
 	struct intel_iommu *iommu;
 
@@ -2310,7 +2346,7 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
 	if (!iova)
 		return;
 	for_each_sg(sglist, sg, nelems, i) {
-		addr = SG_ENT_VIRT_ADDRESS(sg);
+		addr = page_to_phys(sg_page(sg)) + sg->offset;
 		size += aligned_size((u64)addr, sg->length);
 	}
 
@@ -2337,7 +2373,7 @@ static int intel_nontranslate_map_sg(struct device *hddev,
 
 	for_each_sg(sglist, sg, nelems, i) {
 		BUG_ON(!sg_page(sg));
-		sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
+		sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
 		sg->dma_length = sg->length;
 	}
 	return nelems;
@@ -2346,7 +2382,7 @@ static int intel_nontranslate_map_sg(struct device *hddev,
 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
 			enum dma_data_direction dir, struct dma_attrs *attrs)
 {
-	void *addr;
+	phys_addr_t addr;
 	int i;
 	struct pci_dev *pdev = to_pci_dev(hwdev);
 	struct dmar_domain *domain;
@@ -2370,8 +2406,7 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne
 	iommu = domain_get_iommu(domain);
 
 	for_each_sg(sglist, sg, nelems, i) {
-		addr = SG_ENT_VIRT_ADDRESS(sg);
-		addr = (void *)virt_to_phys(addr);
+		addr = page_to_phys(sg_page(sg)) + sg->offset;
 		size += aligned_size((u64)addr, sg->length);
 	}
 
@@ -2394,8 +2429,7 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne
 	start_addr = iova->pfn_lo << PAGE_SHIFT;
 	offset = 0;
 	for_each_sg(sglist, sg, nelems, i) {
-		addr = SG_ENT_VIRT_ADDRESS(sg);
-		addr = (void *)virt_to_phys(addr);
+		addr = page_to_phys(sg_page(sg)) + sg->offset;
 		size = aligned_size((u64)addr, sg->length);
 		ret = domain_page_mapping(domain, start_addr + offset,
 			((u64)addr) & PAGE_MASK,
@@ -2628,6 +2662,33 @@ static int vm_domain_add_dev_info(struct dmar_domain *domain,
 	return 0;
 }
 
+static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
+					   struct pci_dev *pdev)
+{
+	struct pci_dev *tmp, *parent;
+
+	if (!iommu || !pdev)
+		return;
+
+	/* dependent device detach */
+	tmp = pci_find_upstream_pcie_bridge(pdev);
+	/* Secondary interface's bus number and devfn 0 */
+	if (tmp) {
+		parent = pdev->bus->self;
+		while (parent != tmp) {
+			iommu_detach_dev(iommu, parent->bus->number,
+				parent->devfn);
+			parent = parent->bus->self;
+		}
+		if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
+			iommu_detach_dev(iommu,
+				tmp->subordinate->number, 0);
+		else /* this is a legacy PCI bridge */
+			iommu_detach_dev(iommu,
+				tmp->bus->number, tmp->devfn);
+	}
+}
+
 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
 					  struct pci_dev *pdev)
 {
@@ -2653,6 +2714,7 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
 			spin_unlock_irqrestore(&device_domain_lock, flags);
 
 			iommu_detach_dev(iommu, info->bus, info->devfn);
+			iommu_detach_dependent_devices(iommu, pdev);
 			free_devinfo_mem(info);
 
 			spin_lock_irqsave(&device_domain_lock, flags);
@@ -2676,7 +2738,7 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
 		spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
 		clear_bit(iommu->seq_id, &domain->iommu_bmp);
 		domain->iommu_count--;
-		domain_update_iommu_coherency(domain);
+		domain_update_iommu_cap(domain);
 		spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
 	}
 
@@ -2702,15 +2764,16 @@ static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
 
 		iommu = device_to_iommu(info->bus, info->devfn);
 		iommu_detach_dev(iommu, info->bus, info->devfn);
+		iommu_detach_dependent_devices(iommu, info->dev);
 
 		/* clear this iommu in iommu_bmp, update iommu count
-		 * and coherency
+		 * and capabilities
 		 */
 		spin_lock_irqsave(&domain->iommu_lock, flags2);
 		if (test_and_clear_bit(iommu->seq_id,
 				       &domain->iommu_bmp)) {
 			domain->iommu_count--;
-			domain_update_iommu_coherency(domain);
+			domain_update_iommu_cap(domain);
 		}
 		spin_unlock_irqrestore(&domain->iommu_lock, flags2);
 
@@ -2933,6 +2996,8 @@ static int intel_iommu_map_range(struct iommu_domain *domain,
 		prot |= DMA_PTE_READ;
 	if (iommu_prot & IOMMU_WRITE)
 		prot |= DMA_PTE_WRITE;
+	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
+		prot |= DMA_PTE_SNP;
 
 	max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
 	if (dmar_domain->max_addr < max_addr) {
@@ -2986,6 +3051,17 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
 	return phys;
 }
 
+static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
+				      unsigned long cap)
+{
+	struct dmar_domain *dmar_domain = domain->priv;
+
+	if (cap == IOMMU_CAP_CACHE_COHERENCY)
+		return dmar_domain->iommu_snooping;
+
+	return 0;
+}
+
 static struct iommu_ops intel_iommu_ops = {
 	.domain_init	= intel_iommu_domain_init,
 	.domain_destroy = intel_iommu_domain_destroy,
@@ -2994,6 +3070,7 @@ static struct iommu_ops intel_iommu_ops = {
 	.map		= intel_iommu_map_range,
 	.unmap		= intel_iommu_unmap_range,
 	.iova_to_phys	= intel_iommu_iova_to_phys,
+	.domain_has_cap = intel_iommu_domain_has_cap,
 };
 
 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
diff --git a/drivers/pcmcia/pxa2xx_cm_x255.c b/drivers/pcmcia/pxa2xx_cm_x255.c
index 4ed64d8e95e..5143a760153 100644
--- a/drivers/pcmcia/pxa2xx_cm_x255.c
+++ b/drivers/pcmcia/pxa2xx_cm_x255.c
@@ -63,7 +63,7 @@ static void cmx255_pcmcia_socket_state(struct soc_pcmcia_socket *skt,
 				       struct pcmcia_state *state)
 {
 	int cd = skt->nr ? GPIO_PCMCIA_S1_CD_VALID : GPIO_PCMCIA_S0_CD_VALID;
-	int rdy = skt->nr ? GPIO_PCMCIA_S0_RDYINT : GPIO_PCMCIA_S1_RDYINT;
+	int rdy = skt->nr ? GPIO_PCMCIA_S1_RDYINT : GPIO_PCMCIA_S0_RDYINT;
 
 	state->detect = !gpio_get_value(cd);
 	state->ready  = !!gpio_get_value(rdy);
diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig
index e7e0cf102d6..e58c0ce65aa 100644
--- a/drivers/regulator/Kconfig
+++ b/drivers/regulator/Kconfig
@@ -29,8 +29,12 @@ config REGULATOR_DEBUG
 	  Say yes here to enable debugging support.
 
 config REGULATOR_FIXED_VOLTAGE
-	tristate
+	tristate "Fixed voltage regulator support"
 	default n
+	help
+	  This driver provides support for fixed voltage regulators,
+	  useful for systems which use a combination of software
+	  managed regulators and simple non-configurable regulators.
 
 config REGULATOR_VIRTUAL_CONSUMER
 	tristate "Virtual regulator consumer support"
@@ -52,6 +56,13 @@ config REGULATOR_BQ24022
 	  charging select between 100 mA and 500 mA charging current
 	  limit.
 
+config REGULATOR_TWL4030
+	bool "TI TWL4030/TWL5030/TPS695x0 PMIC"
+	depends on TWL4030_CORE
+	help
+	  This driver supports the voltage regulators provided by
+	  this family of companion chips.
+
 config REGULATOR_WM8350
 	tristate "Wolfson Microelectroncis WM8350 AudioPlus PMIC"
 	depends on MFD_WM8350
diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile
index 61b30c6ddec..bac133afc06 100644
--- a/drivers/regulator/Makefile
+++ b/drivers/regulator/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_REGULATOR_FIXED_VOLTAGE) += fixed.o
 obj-$(CONFIG_REGULATOR_VIRTUAL_CONSUMER) += virtual.o
 
 obj-$(CONFIG_REGULATOR_BQ24022) += bq24022.o
+obj-$(CONFIG_REGULATOR_TWL4030) += twl4030-regulator.o
 obj-$(CONFIG_REGULATOR_WM8350) += wm8350-regulator.o
 obj-$(CONFIG_REGULATOR_WM8400) += wm8400-regulator.o
 obj-$(CONFIG_REGULATOR_DA903X)	+= da903x.o
diff --git a/drivers/regulator/bq24022.c b/drivers/regulator/bq24022.c
index c175e38a4cd..7ecb820ceeb 100644
--- a/drivers/regulator/bq24022.c
+++ b/drivers/regulator/bq24022.c
@@ -105,7 +105,8 @@ static int __init bq24022_probe(struct platform_device *pdev)
 	ret = gpio_direction_output(pdata->gpio_iset2, 0);
 	ret = gpio_direction_output(pdata->gpio_nce, 1);
 
-	bq24022 = regulator_register(&bq24022_desc, &pdev->dev, pdata);
+	bq24022 = regulator_register(&bq24022_desc, &pdev->dev,
+				     pdata->init_data, pdata);
 	if (IS_ERR(bq24022)) {
 		dev_dbg(&pdev->dev, "couldn't register regulator\n");
 		ret = PTR_ERR(bq24022);
diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index f511a406fca..01f7702a805 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -28,33 +28,7 @@
 static DEFINE_MUTEX(regulator_list_mutex);
 static LIST_HEAD(regulator_list);
 static LIST_HEAD(regulator_map_list);
-
-/*
- * struct regulator_dev
- *
- * Voltage / Current regulator class device. One for each regulator.
- */
-struct regulator_dev {
-	struct regulator_desc *desc;
-	int use_count;
-
-	/* lists we belong to */
-	struct list_head list; /* list of all regulators */
-	struct list_head slist; /* list of supplied regulators */
-
-	/* lists we own */
-	struct list_head consumer_list; /* consumers we supply */
-	struct list_head supply_list; /* regulators we supply */
-
-	struct blocking_notifier_head notifier;
-	struct mutex mutex; /* consumer lock */
-	struct module *owner;
-	struct device dev;
-	struct regulation_constraints *constraints;
-	struct regulator_dev *supply;	/* for tree */
-
-	void *reg_data;		/* regulator_dev data */
-};
+static int has_full_constraints;
 
 /*
  * struct regulator_map
@@ -79,7 +53,6 @@ struct regulator {
 	int uA_load;
 	int min_uV;
 	int max_uV;
-	int enabled; /* count of client enables */
 	char *supply_name;
 	struct device_attribute dev_attr;
 	struct regulator_dev *rdev;
@@ -312,6 +285,47 @@ static ssize_t regulator_state_show(struct device *dev,
 }
 static DEVICE_ATTR(state, 0444, regulator_state_show, NULL);
 
+static ssize_t regulator_status_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct regulator_dev *rdev = dev_get_drvdata(dev);
+	int status;
+	char *label;
+
+	status = rdev->desc->ops->get_status(rdev);
+	if (status < 0)
+		return status;
+
+	switch (status) {
+	case REGULATOR_STATUS_OFF:
+		label = "off";
+		break;
+	case REGULATOR_STATUS_ON:
+		label = "on";
+		break;
+	case REGULATOR_STATUS_ERROR:
+		label = "error";
+		break;
+	case REGULATOR_STATUS_FAST:
+		label = "fast";
+		break;
+	case REGULATOR_STATUS_NORMAL:
+		label = "normal";
+		break;
+	case REGULATOR_STATUS_IDLE:
+		label = "idle";
+		break;
+	case REGULATOR_STATUS_STANDBY:
+		label = "standby";
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	return sprintf(buf, "%s\n", label);
+}
+static DEVICE_ATTR(status, 0444, regulator_status_show, NULL);
+
 static ssize_t regulator_min_uA_show(struct device *dev,
 				    struct device_attribute *attr, char *buf)
 {
@@ -678,6 +692,73 @@ static int set_machine_constraints(struct regulator_dev *rdev,
 	else
 		name = "regulator";
 
+	/* constrain machine-level voltage specs to fit
+	 * the actual range supported by this regulator.
+	 */
+	if (ops->list_voltage && rdev->desc->n_voltages) {
+		int	count = rdev->desc->n_voltages;
+		int	i;
+		int	min_uV = INT_MAX;
+		int	max_uV = INT_MIN;
+		int	cmin = constraints->min_uV;
+		int	cmax = constraints->max_uV;
+
+		/* it's safe to autoconfigure fixed-voltage supplies */
+		if (count == 1 && !cmin) {
+			cmin = INT_MIN;
+			cmax = INT_MAX;
+		}
+
+		/* voltage constraints are optional */
+		if ((cmin == 0) && (cmax == 0))
+			goto out;
+
+		/* else require explicit machine-level constraints */
+		if (cmin <= 0 || cmax <= 0 || cmax < cmin) {
+			pr_err("%s: %s '%s' voltage constraints\n",
+				       __func__, "invalid", name);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		/* initial: [cmin..cmax] valid, [min_uV..max_uV] not */
+		for (i = 0; i < count; i++) {
+			int	value;
+
+			value = ops->list_voltage(rdev, i);
+			if (value <= 0)
+				continue;
+
+			/* maybe adjust [min_uV..max_uV] */
+			if (value >= cmin && value < min_uV)
+				min_uV = value;
+			if (value <= cmax && value > max_uV)
+				max_uV = value;
+		}
+
+		/* final: [min_uV..max_uV] valid iff constraints valid */
+		if (max_uV < min_uV) {
+			pr_err("%s: %s '%s' voltage constraints\n",
+				       __func__, "unsupportable", name);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		/* use regulator's subset of machine constraints */
+		if (constraints->min_uV < min_uV) {
+			pr_debug("%s: override '%s' %s, %d -> %d\n",
+				       __func__, name, "min_uV",
+					constraints->min_uV, min_uV);
+			constraints->min_uV = min_uV;
+		}
+		if (constraints->max_uV > max_uV) {
+			pr_debug("%s: override '%s' %s, %d -> %d\n",
+				       __func__, name, "max_uV",
+					constraints->max_uV, max_uV);
+			constraints->max_uV = max_uV;
+		}
+	}
+
 	rdev->constraints = constraints;
 
 	/* do we need to apply the constraint voltage */
@@ -695,10 +776,6 @@ static int set_machine_constraints(struct regulator_dev *rdev,
 			}
 	}
 
-	/* are we enabled at boot time by firmware / bootloader */
-	if (rdev->constraints->boot_on)
-		rdev->use_count = 1;
-
 	/* do we need to setup our suspend state */
 	if (constraints->initial_state) {
 		ret = suspend_prepare(rdev, constraints->initial_state);
@@ -710,11 +787,27 @@ static int set_machine_constraints(struct regulator_dev *rdev,
 		}
 	}
 
-	/* if always_on is set then turn the regulator on if it's not
-	 * already on. */
-	if (constraints->always_on && ops->enable &&
-	    ((ops->is_enabled && !ops->is_enabled(rdev)) ||
-	     (!ops->is_enabled && !constraints->boot_on))) {
+	if (constraints->initial_mode) {
+		if (!ops->set_mode) {
+			printk(KERN_ERR "%s: no set_mode operation for %s\n",
+			       __func__, name);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		ret = ops->set_mode(rdev, constraints->initial_mode);
+		if (ret < 0) {
+			printk(KERN_ERR
+			       "%s: failed to set initial mode for %s: %d\n",
+			       __func__, name, ret);
+			goto out;
+		}
+	}
+
+	/* If the constraints say the regulator should be on at this point
+	 * and we have control then make sure it is enabled.
+	 */
+	if ((constraints->always_on || constraints->boot_on) && ops->enable) {
 		ret = ops->enable(rdev);
 		if (ret < 0) {
 			printk(KERN_ERR "%s: failed to enable %s\n",
@@ -817,6 +910,19 @@ static void unset_consumer_device_supply(struct regulator_dev *rdev,
 	}
 }
 
+static void unset_regulator_supplies(struct regulator_dev *rdev)
+{
+	struct regulator_map *node, *n;
+
+	list_for_each_entry_safe(node, n, &regulator_map_list, list) {
+		if (rdev == node->regulator) {
+			list_del(&node->list);
+			kfree(node);
+			return;
+		}
+	}
+}
+
 #define REG_STR_SIZE	32
 
 static struct regulator *create_regulator(struct regulator_dev *rdev,
@@ -898,9 +1004,12 @@ overflow_err:
  * @id: Supply name or regulator ID.
  *
  * Returns a struct regulator corresponding to the regulator producer,
- * or IS_ERR() condition containing errno.  Use of supply names
- * configured via regulator_set_device_supply() is strongly
- * encouraged.
+ * or IS_ERR() condition containing errno.
+ *
+ * Use of supply names configured via regulator_set_device_supply() is
+ * strongly encouraged.  It is recommended that the supply name used
+ * should match the name used for the supply and/or the relevant
+ * device pins in the datasheet.
  */
 struct regulator *regulator_get(struct device *dev, const char *id)
 {
@@ -922,8 +1031,6 @@ struct regulator *regulator_get(struct device *dev, const char *id)
 			goto found;
 		}
 	}
-	printk(KERN_ERR "regulator: Unable to get requested regulator: %s\n",
-	       id);
 	mutex_unlock(&regulator_list_mutex);
 	return regulator;
 
@@ -961,10 +1068,6 @@ void regulator_put(struct regulator *regulator)
 	mutex_lock(&regulator_list_mutex);
 	rdev = regulator->rdev;
 
-	if (WARN(regulator->enabled, "Releasing supply %s while enabled\n",
-			       regulator->supply_name))
-		_regulator_disable(rdev);
-
 	/* remove any sysfs entries */
 	if (regulator->dev) {
 		sysfs_remove_link(&rdev->dev.kobj, regulator->supply_name);
@@ -1039,12 +1142,7 @@ int regulator_enable(struct regulator *regulator)
 	int ret = 0;
 
 	mutex_lock(&rdev->mutex);
-	if (regulator->enabled == 0)
-		ret = _regulator_enable(rdev);
-	else if (regulator->enabled < 0)
-		ret = -EIO;
-	if (ret == 0)
-		regulator->enabled++;
+	ret = _regulator_enable(rdev);
 	mutex_unlock(&rdev->mutex);
 	return ret;
 }
@@ -1055,6 +1153,11 @@ static int _regulator_disable(struct regulator_dev *rdev)
 {
 	int ret = 0;
 
+	if (WARN(rdev->use_count <= 0,
+			"unbalanced disables for %s\n",
+			rdev->desc->name))
+		return -EIO;
+
 	/* are we the last user and permitted to disable ? */
 	if (rdev->use_count == 1 && !rdev->constraints->always_on) {
 
@@ -1103,16 +1206,7 @@ int regulator_disable(struct regulator *regulator)
 	int ret = 0;
 
 	mutex_lock(&rdev->mutex);
-	if (regulator->enabled == 1) {
-		ret = _regulator_disable(rdev);
-		if (ret == 0)
-			regulator->uA_load = 0;
-	} else if (WARN(regulator->enabled <= 0,
-			"unbalanced disables for supply %s\n",
-			regulator->supply_name))
-		ret = -EIO;
-	if (ret == 0)
-		regulator->enabled--;
+	ret = _regulator_disable(rdev);
 	mutex_unlock(&rdev->mutex);
 	return ret;
 }
@@ -1159,7 +1253,6 @@ int regulator_force_disable(struct regulator *regulator)
 	int ret;
 
 	mutex_lock(&regulator->rdev->mutex);
-	regulator->enabled = 0;
 	regulator->uA_load = 0;
 	ret = _regulator_force_disable(regulator->rdev);
 	mutex_unlock(&regulator->rdev->mutex);
@@ -1204,6 +1297,56 @@ int regulator_is_enabled(struct regulator *regulator)
 EXPORT_SYMBOL_GPL(regulator_is_enabled);
 
 /**
+ * regulator_count_voltages - count regulator_list_voltage() selectors
+ * @regulator: regulator source
+ *
+ * Returns number of selectors, or negative errno.  Selectors are
+ * numbered starting at zero, and typically correspond to bitfields
+ * in hardware registers.
+ */
+int regulator_count_voltages(struct regulator *regulator)
+{
+	struct regulator_dev	*rdev = regulator->rdev;
+
+	return rdev->desc->n_voltages ? : -EINVAL;
+}
+EXPORT_SYMBOL_GPL(regulator_count_voltages);
+
+/**
+ * regulator_list_voltage - enumerate supported voltages
+ * @regulator: regulator source
+ * @selector: identify voltage to list
+ * Context: can sleep
+ *
+ * Returns a voltage that can be passed to @regulator_set_voltage(),
+ * zero if this selector code can't be used on this sytem, or a
+ * negative errno.
+ */
+int regulator_list_voltage(struct regulator *regulator, unsigned selector)
+{
+	struct regulator_dev	*rdev = regulator->rdev;
+	struct regulator_ops	*ops = rdev->desc->ops;
+	int			ret;
+
+	if (!ops->list_voltage || selector >= rdev->desc->n_voltages)
+		return -EINVAL;
+
+	mutex_lock(&rdev->mutex);
+	ret = ops->list_voltage(rdev, selector);
+	mutex_unlock(&rdev->mutex);
+
+	if (ret > 0) {
+		if (ret < rdev->constraints->min_uV)
+			ret = 0;
+		else if (ret > rdev->constraints->max_uV)
+			ret = 0;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(regulator_list_voltage);
+
+/**
  * regulator_set_voltage - set regulator output voltage
  * @regulator: regulator source
  * @min_uV: Minimum required voltage in uV
@@ -1243,6 +1386,7 @@ int regulator_set_voltage(struct regulator *regulator, int min_uV, int max_uV)
 	ret = rdev->desc->ops->set_voltage(rdev, min_uV, max_uV);
 
 out:
+	_notifier_call_chain(rdev, REGULATOR_EVENT_VOLTAGE_CHANGE, NULL);
 	mutex_unlock(&rdev->mutex);
 	return ret;
 }
@@ -1543,20 +1687,23 @@ int regulator_unregister_notifier(struct regulator *regulator,
 }
 EXPORT_SYMBOL_GPL(regulator_unregister_notifier);
 
-/* notify regulator consumers and downstream regulator consumers */
+/* notify regulator consumers and downstream regulator consumers.
+ * Note mutex must be held by caller.
+ */
 static void _notifier_call_chain(struct regulator_dev *rdev,
 				  unsigned long event, void *data)
 {
 	struct regulator_dev *_rdev;
 
 	/* call rdev chain first */
-	mutex_lock(&rdev->mutex);
 	blocking_notifier_call_chain(&rdev->notifier, event, NULL);
-	mutex_unlock(&rdev->mutex);
 
 	/* now notify regulator we supply */
-	list_for_each_entry(_rdev, &rdev->supply_list, slist)
-		_notifier_call_chain(_rdev, event, data);
+	list_for_each_entry(_rdev, &rdev->supply_list, slist) {
+	  mutex_lock(&_rdev->mutex);
+	  _notifier_call_chain(_rdev, event, data);
+	  mutex_unlock(&_rdev->mutex);
+	}
 }
 
 /**
@@ -1703,6 +1850,7 @@ EXPORT_SYMBOL_GPL(regulator_bulk_free);
  *
  * Called by regulator drivers to notify clients a regulator event has
  * occurred. We also notify regulator clients downstream.
+ * Note lock must be held by caller.
  */
 int regulator_notifier_call_chain(struct regulator_dev *rdev,
 				  unsigned long event, void *data)
@@ -1744,6 +1892,11 @@ static int add_regulator_attributes(struct regulator_dev *rdev)
 		if (status < 0)
 			return status;
 	}
+	if (ops->get_status) {
+		status = device_create_file(dev, &dev_attr_status);
+		if (status < 0)
+			return status;
+	}
 
 	/* some attributes are type-specific */
 	if (rdev->desc->type == REGULATOR_CURRENT) {
@@ -1828,17 +1981,18 @@ static int add_regulator_attributes(struct regulator_dev *rdev)
  * regulator_register - register regulator
  * @regulator_desc: regulator to register
  * @dev: struct device for the regulator
+ * @init_data: platform provided init data, passed through by driver
  * @driver_data: private regulator data
  *
  * Called by regulator drivers to register a regulator.
  * Returns 0 on success.
  */
 struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc,
-	struct device *dev, void *driver_data)
+	struct device *dev, struct regulator_init_data *init_data,
+	void *driver_data)
 {
 	static atomic_t regulator_no = ATOMIC_INIT(0);
 	struct regulator_dev *rdev;
-	struct regulator_init_data *init_data = dev->platform_data;
 	int ret, i;
 
 	if (regulator_desc == NULL)
@@ -1945,6 +2099,7 @@ void regulator_unregister(struct regulator_dev *rdev)
 		return;
 
 	mutex_lock(&regulator_list_mutex);
+	unset_regulator_supplies(rdev);
 	list_del(&rdev->list);
 	if (rdev->supply)
 		sysfs_remove_link(&rdev->dev.kobj, "supply");
@@ -1989,6 +2144,23 @@ out:
 EXPORT_SYMBOL_GPL(regulator_suspend_prepare);
 
 /**
+ * regulator_has_full_constraints - the system has fully specified constraints
+ *
+ * Calling this function will cause the regulator API to disable all
+ * regulators which have a zero use count and don't have an always_on
+ * constraint in a late_initcall.
+ *
+ * The intention is that this will become the default behaviour in a
+ * future kernel release so users are encouraged to use this facility
+ * now.
+ */
+void regulator_has_full_constraints(void)
+{
+	has_full_constraints = 1;
+}
+EXPORT_SYMBOL_GPL(regulator_has_full_constraints);
+
+/**
  * rdev_get_drvdata - get rdev regulator driver data
  * @rdev: regulator
  *
@@ -2055,3 +2227,77 @@ static int __init regulator_init(void)
 
 /* init early to allow our consumers to complete system booting */
 core_initcall(regulator_init);
+
+static int __init regulator_init_complete(void)
+{
+	struct regulator_dev *rdev;
+	struct regulator_ops *ops;
+	struct regulation_constraints *c;
+	int enabled, ret;
+	const char *name;
+
+	mutex_lock(&regulator_list_mutex);
+
+	/* If we have a full configuration then disable any regulators
+	 * which are not in use or always_on.  This will become the
+	 * default behaviour in the future.
+	 */
+	list_for_each_entry(rdev, &regulator_list, list) {
+		ops = rdev->desc->ops;
+		c = rdev->constraints;
+
+		if (c->name)
+			name = c->name;
+		else if (rdev->desc->name)
+			name = rdev->desc->name;
+		else
+			name = "regulator";
+
+		if (!ops->disable || c->always_on)
+			continue;
+
+		mutex_lock(&rdev->mutex);
+
+		if (rdev->use_count)
+			goto unlock;
+
+		/* If we can't read the status assume it's on. */
+		if (ops->is_enabled)
+			enabled = ops->is_enabled(rdev);
+		else
+			enabled = 1;
+
+		if (!enabled)
+			goto unlock;
+
+		if (has_full_constraints) {
+			/* We log since this may kill the system if it
+			 * goes wrong. */
+			printk(KERN_INFO "%s: disabling %s\n",
+			       __func__, name);
+			ret = ops->disable(rdev);
+			if (ret != 0) {
+				printk(KERN_ERR
+				       "%s: couldn't disable %s: %d\n",
+				       __func__, name, ret);
+			}
+		} else {
+			/* The intention is that in future we will
+			 * assume that full constraints are provided
+			 * so warn even if we aren't going to do
+			 * anything here.
+			 */
+			printk(KERN_WARNING
+			       "%s: incomplete constraints, leaving %s on\n",
+			       __func__, name);
+		}
+
+unlock:
+		mutex_unlock(&rdev->mutex);
+	}
+
+	mutex_unlock(&regulator_list_mutex);
+
+	return 0;
+}
+late_initcall(regulator_init_complete);
diff --git a/drivers/regulator/da903x.c b/drivers/regulator/da903x.c
index fe77730a7ed..72b15495183 100644
--- a/drivers/regulator/da903x.c
+++ b/drivers/regulator/da903x.c
@@ -471,7 +471,8 @@ static int __devinit da903x_regulator_probe(struct platform_device *pdev)
 	if (ri->desc.id == DA9030_ID_LDO1 || ri->desc.id == DA9030_ID_LDO15)
 		ri->desc.ops = &da9030_regulator_ldo1_15_ops;
 
-	rdev = regulator_register(&ri->desc, &pdev->dev, ri);
+	rdev = regulator_register(&ri->desc, &pdev->dev,
+				  pdev->dev.platform_data, ri);
 	if (IS_ERR(rdev)) {
 		dev_err(&pdev->dev, "failed to register regulator %s\n",
 				ri->desc.name);
diff --git a/drivers/regulator/fixed.c b/drivers/regulator/fixed.c
index d31db3e1491..23d554628a7 100644
--- a/drivers/regulator/fixed.c
+++ b/drivers/regulator/fixed.c
@@ -73,7 +73,8 @@ static int regulator_fixed_voltage_probe(struct platform_device *pdev)
 
 	drvdata->microvolts = config->microvolts;
 
-	drvdata->dev = regulator_register(&drvdata->desc, drvdata);
+	drvdata->dev = regulator_register(&drvdata->desc, &pdev->dev,
+					  config->init_data, drvdata);
 	if (IS_ERR(drvdata->dev)) {
 		ret = PTR_ERR(drvdata->dev);
 		goto err_name;
diff --git a/drivers/regulator/pcf50633-regulator.c b/drivers/regulator/pcf50633-regulator.c
index 4cc85ec6e12..cd761d85c8f 100644
--- a/drivers/regulator/pcf50633-regulator.c
+++ b/drivers/regulator/pcf50633-regulator.c
@@ -284,7 +284,8 @@ static int __devinit pcf50633_regulator_probe(struct platform_device *pdev)
 	/* Already set by core driver */
 	pcf = platform_get_drvdata(pdev);
 
-	rdev = regulator_register(&regulators[pdev->id], &pdev->dev, pcf);
+	rdev = regulator_register(&regulators[pdev->id], &pdev->dev,
+				  pdev->dev.platform_data, pcf);
 	if (IS_ERR(rdev))
 		return PTR_ERR(rdev);
 
diff --git a/drivers/regulator/twl4030-regulator.c b/drivers/regulator/twl4030-regulator.c
new file mode 100644
index 00000000000..e2032fb60b5
--- /dev/null
+++ b/drivers/regulator/twl4030-regulator.c
@@ -0,0 +1,500 @@
+/*
+ * twl4030-regulator.c -- support regulators in twl4030 family chips
+ *
+ * Copyright (C) 2008 David Brownell
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/driver.h>
+#include <linux/regulator/machine.h>
+#include <linux/i2c/twl4030.h>
+
+
+/*
+ * The TWL4030/TW5030/TPS659x0 family chips include power management, a
+ * USB OTG transceiver, an RTC, ADC, PWM, and lots more.  Some versions
+ * include an audio codec, battery charger, and more voltage regulators.
+ * These chips are often used in OMAP-based systems.
+ *
+ * This driver implements software-based resource control for various
+ * voltage regulators.  This is usually augmented with state machine
+ * based control.
+ */
+
+struct twlreg_info {
+	/* start of regulator's PM_RECEIVER control register bank */
+	u8			base;
+
+	/* twl4030 resource ID, for resource control state machine */
+	u8			id;
+
+	/* voltage in mV = table[VSEL]; table_len must be a power-of-two */
+	u8			table_len;
+	const u16		*table;
+
+	/* chip constraints on regulator behavior */
+	u16			min_mV;
+
+	/* used by regulator core */
+	struct regulator_desc	desc;
+};
+
+
+/* LDO control registers ... offset is from the base of its register bank.
+ * The first three registers of all power resource banks help hardware to
+ * manage the various resource groups.
+ */
+#define VREG_GRP		0
+#define VREG_TYPE		1
+#define VREG_REMAP		2
+#define VREG_DEDICATED		3	/* LDO control */
+
+
+static inline int
+twl4030reg_read(struct twlreg_info *info, unsigned offset)
+{
+	u8 value;
+	int status;
+
+	status = twl4030_i2c_read_u8(TWL4030_MODULE_PM_RECEIVER,
+			&value, info->base + offset);
+	return (status < 0) ? status : value;
+}
+
+static inline int
+twl4030reg_write(struct twlreg_info *info, unsigned offset, u8 value)
+{
+	return twl4030_i2c_write_u8(TWL4030_MODULE_PM_RECEIVER,
+			value, info->base + offset);
+}
+
+/*----------------------------------------------------------------------*/
+
+/* generic power resource operations, which work on all regulators */
+
+static int twl4030reg_grp(struct regulator_dev *rdev)
+{
+	return twl4030reg_read(rdev_get_drvdata(rdev), VREG_GRP);
+}
+
+/*
+ * Enable/disable regulators by joining/leaving the P1 (processor) group.
+ * We assume nobody else is updating the DEV_GRP registers.
+ */
+
+#define P3_GRP		BIT(7)		/* "peripherals" */
+#define P2_GRP		BIT(6)		/* secondary processor, modem, etc */
+#define P1_GRP		BIT(5)		/* CPU/Linux */
+
+static int twl4030reg_is_enabled(struct regulator_dev *rdev)
+{
+	int	state = twl4030reg_grp(rdev);
+
+	if (state < 0)
+		return state;
+
+	return (state & P1_GRP) != 0;
+}
+
+static int twl4030reg_enable(struct regulator_dev *rdev)
+{
+	struct twlreg_info	*info = rdev_get_drvdata(rdev);
+	int			grp;
+
+	grp = twl4030reg_read(info, VREG_GRP);
+	if (grp < 0)
+		return grp;
+
+	grp |= P1_GRP;
+	return twl4030reg_write(info, VREG_GRP, grp);
+}
+
+static int twl4030reg_disable(struct regulator_dev *rdev)
+{
+	struct twlreg_info	*info = rdev_get_drvdata(rdev);
+	int			grp;
+
+	grp = twl4030reg_read(info, VREG_GRP);
+	if (grp < 0)
+		return grp;
+
+	grp &= ~P1_GRP;
+	return twl4030reg_write(info, VREG_GRP, grp);
+}
+
+static int twl4030reg_get_status(struct regulator_dev *rdev)
+{
+	int	state = twl4030reg_grp(rdev);
+
+	if (state < 0)
+		return state;
+	state &= 0x0f;
+
+	/* assume state != WARM_RESET; we'd not be running...  */
+	if (!state)
+		return REGULATOR_STATUS_OFF;
+	return (state & BIT(3))
+		? REGULATOR_STATUS_NORMAL
+		: REGULATOR_STATUS_STANDBY;
+}
+
+static int twl4030reg_set_mode(struct regulator_dev *rdev, unsigned mode)
+{
+	struct twlreg_info	*info = rdev_get_drvdata(rdev);
+	unsigned		message;
+	int			status;
+
+	/* We can only set the mode through state machine commands... */
+	switch (mode) {
+	case REGULATOR_MODE_NORMAL:
+		message = MSG_SINGULAR(DEV_GRP_P1, info->id, RES_STATE_ACTIVE);
+		break;
+	case REGULATOR_MODE_STANDBY:
+		message = MSG_SINGULAR(DEV_GRP_P1, info->id, RES_STATE_SLEEP);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* Ensure the resource is associated with some group */
+	status = twl4030reg_grp(rdev);
+	if (status < 0)
+		return status;
+	if (!(status & (P3_GRP | P2_GRP | P1_GRP)))
+		return -EACCES;
+
+	status = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER,
+			message >> 8, 0x15 /* PB_WORD_MSB */ );
+	if (status >= 0)
+		return status;
+
+	return twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER,
+			message, 0x16 /* PB_WORD_LSB */ );
+}
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * Support for adjustable-voltage LDOs uses a four bit (or less) voltage
+ * select field in its control register.   We use tables indexed by VSEL
+ * to record voltages in milliVolts.  (Accuracy is about three percent.)
+ *
+ * Note that VSEL values for VAUX2 changed in twl5030 and newer silicon;
+ * currently handled by listing two slightly different VAUX2 regulators,
+ * only one of which will be configured.
+ *
+ * VSEL values documented as "TI cannot support these values" are flagged
+ * in these tables as UNSUP() values; we normally won't assign them.
+ *
+ * VAUX3 at 3V is incorrectly listed in some TI manuals as unsupported.
+ * TI are revising the twl5030/tps659x0 specs to support that 3.0V setting.
+ */
+#ifdef CONFIG_TWL4030_ALLOW_UNSUPPORTED
+#define UNSUP_MASK	0x0000
+#else
+#define UNSUP_MASK	0x8000
+#endif
+
+#define UNSUP(x)	(UNSUP_MASK | (x))
+#define IS_UNSUP(x)	(UNSUP_MASK & (x))
+#define LDO_MV(x)	(~UNSUP_MASK & (x))
+
+
+static const u16 VAUX1_VSEL_table[] = {
+	UNSUP(1500), UNSUP(1800), 2500, 2800,
+	3000, 3000, 3000, 3000,
+};
+static const u16 VAUX2_4030_VSEL_table[] = {
+	UNSUP(1000), UNSUP(1000), UNSUP(1200), 1300,
+	1500, 1800, UNSUP(1850), 2500,
+	UNSUP(2600), 2800, UNSUP(2850), UNSUP(3000),
+	UNSUP(3150), UNSUP(3150), UNSUP(3150), UNSUP(3150),
+};
+static const u16 VAUX2_VSEL_table[] = {
+	1700, 1700, 1900, 1300,
+	1500, 1800, 2000, 2500,
+	2100, 2800, 2200, 2300,
+	2400, 2400, 2400, 2400,
+};
+static const u16 VAUX3_VSEL_table[] = {
+	1500, 1800, 2500, 2800,
+	3000, 3000, 3000, 3000,
+};
+static const u16 VAUX4_VSEL_table[] = {
+	700, 1000, 1200, UNSUP(1300),
+	1500, 1800, UNSUP(1850), 2500,
+	UNSUP(2600), 2800, UNSUP(2850), UNSUP(3000),
+	UNSUP(3150), UNSUP(3150), UNSUP(3150), UNSUP(3150),
+};
+static const u16 VMMC1_VSEL_table[] = {
+	1850, 2850, 3000, 3150,
+};
+static const u16 VMMC2_VSEL_table[] = {
+	UNSUP(1000), UNSUP(1000), UNSUP(1200), UNSUP(1300),
+	UNSUP(1500), UNSUP(1800), 1850, UNSUP(2500),
+	2600, 2800, 2850, 3000,
+	3150, 3150, 3150, 3150,
+};
+static const u16 VPLL1_VSEL_table[] = {
+	1000, 1200, 1300, 1800,
+	UNSUP(2800), UNSUP(3000), UNSUP(3000), UNSUP(3000),
+};
+static const u16 VPLL2_VSEL_table[] = {
+	700, 1000, 1200, 1300,
+	UNSUP(1500), 1800, UNSUP(1850), UNSUP(2500),
+	UNSUP(2600), UNSUP(2800), UNSUP(2850), UNSUP(3000),
+	UNSUP(3150), UNSUP(3150), UNSUP(3150), UNSUP(3150),
+};
+static const u16 VSIM_VSEL_table[] = {
+	UNSUP(1000), UNSUP(1200), UNSUP(1300), 1800,
+	2800, 3000, 3000, 3000,
+};
+static const u16 VDAC_VSEL_table[] = {
+	1200, 1300, 1800, 1800,
+};
+
+
+static int twl4030ldo_list_voltage(struct regulator_dev *rdev, unsigned index)
+{
+	struct twlreg_info	*info = rdev_get_drvdata(rdev);
+	int			mV = info->table[index];
+
+	return IS_UNSUP(mV) ? 0 : (LDO_MV(mV) * 1000);
+}
+
+static int
+twl4030ldo_set_voltage(struct regulator_dev *rdev, int min_uV, int max_uV)
+{
+	struct twlreg_info	*info = rdev_get_drvdata(rdev);
+	int			vsel;
+
+	for (vsel = 0; vsel < info->table_len; vsel++) {
+		int mV = info->table[vsel];
+		int uV;
+
+		if (IS_UNSUP(mV))
+			continue;
+		uV = LDO_MV(mV) * 1000;
+
+		/* REVISIT for VAUX2, first match may not be best/lowest */
+
+		/* use the first in-range value */
+		if (min_uV <= uV && uV <= max_uV)
+			return twl4030reg_write(info, VREG_DEDICATED, vsel);
+	}
+
+	return -EDOM;
+}
+
+static int twl4030ldo_get_voltage(struct regulator_dev *rdev)
+{
+	struct twlreg_info	*info = rdev_get_drvdata(rdev);
+	int			vsel = twl4030reg_read(info, VREG_DEDICATED);
+
+	if (vsel < 0)
+		return vsel;
+
+	vsel &= info->table_len - 1;
+	return LDO_MV(info->table[vsel]) * 1000;
+}
+
+static struct regulator_ops twl4030ldo_ops = {
+	.list_voltage	= twl4030ldo_list_voltage,
+
+	.set_voltage	= twl4030ldo_set_voltage,
+	.get_voltage	= twl4030ldo_get_voltage,
+
+	.enable		= twl4030reg_enable,
+	.disable	= twl4030reg_disable,
+	.is_enabled	= twl4030reg_is_enabled,
+
+	.set_mode	= twl4030reg_set_mode,
+
+	.get_status	= twl4030reg_get_status,
+};
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * Fixed voltage LDOs don't have a VSEL field to update.
+ */
+static int twl4030fixed_list_voltage(struct regulator_dev *rdev, unsigned index)
+{
+	struct twlreg_info	*info = rdev_get_drvdata(rdev);
+
+	return info->min_mV * 1000;
+}
+
+static int twl4030fixed_get_voltage(struct regulator_dev *rdev)
+{
+	struct twlreg_info	*info = rdev_get_drvdata(rdev);
+
+	return info->min_mV * 1000;
+}
+
+static struct regulator_ops twl4030fixed_ops = {
+	.list_voltage	= twl4030fixed_list_voltage,
+
+	.get_voltage	= twl4030fixed_get_voltage,
+
+	.enable		= twl4030reg_enable,
+	.disable	= twl4030reg_disable,
+	.is_enabled	= twl4030reg_is_enabled,
+
+	.set_mode	= twl4030reg_set_mode,
+
+	.get_status	= twl4030reg_get_status,
+};
+
+/*----------------------------------------------------------------------*/
+
+#define TWL_ADJUSTABLE_LDO(label, offset, num) { \
+	.base = offset, \
+	.id = num, \
+	.table_len = ARRAY_SIZE(label##_VSEL_table), \
+	.table = label##_VSEL_table, \
+	.desc = { \
+		.name = #label, \
+		.id = TWL4030_REG_##label, \
+		.n_voltages = ARRAY_SIZE(label##_VSEL_table), \
+		.ops = &twl4030ldo_ops, \
+		.type = REGULATOR_VOLTAGE, \
+		.owner = THIS_MODULE, \
+		}, \
+	}
+
+#define TWL_FIXED_LDO(label, offset, mVolts, num) { \
+	.base = offset, \
+	.id = num, \
+	.min_mV = mVolts, \
+	.desc = { \
+		.name = #label, \
+		.id = TWL4030_REG_##label, \
+		.n_voltages = 1, \
+		.ops = &twl4030fixed_ops, \
+		.type = REGULATOR_VOLTAGE, \
+		.owner = THIS_MODULE, \
+		}, \
+	}
+
+/*
+ * We list regulators here if systems need some level of
+ * software control over them after boot.
+ */
+static struct twlreg_info twl4030_regs[] = {
+	TWL_ADJUSTABLE_LDO(VAUX1, 0x17, 1),
+	TWL_ADJUSTABLE_LDO(VAUX2_4030, 0x1b, 2),
+	TWL_ADJUSTABLE_LDO(VAUX2, 0x1b, 2),
+	TWL_ADJUSTABLE_LDO(VAUX3, 0x1f, 3),
+	TWL_ADJUSTABLE_LDO(VAUX4, 0x23, 4),
+	TWL_ADJUSTABLE_LDO(VMMC1, 0x27, 5),
+	TWL_ADJUSTABLE_LDO(VMMC2, 0x2b, 6),
+	/*
+	TWL_ADJUSTABLE_LDO(VPLL1, 0x2f, 7),
+	*/
+	TWL_ADJUSTABLE_LDO(VPLL2, 0x33, 8),
+	TWL_ADJUSTABLE_LDO(VSIM, 0x37, 9),
+	TWL_ADJUSTABLE_LDO(VDAC, 0x3b, 10),
+	/*
+	TWL_ADJUSTABLE_LDO(VINTANA1, 0x3f, 11),
+	TWL_ADJUSTABLE_LDO(VINTANA2, 0x43, 12),
+	TWL_ADJUSTABLE_LDO(VINTDIG, 0x47, 13),
+	TWL_SMPS(VIO, 0x4b, 14),
+	TWL_SMPS(VDD1, 0x55, 15),
+	TWL_SMPS(VDD2, 0x63, 16),
+	 */
+	TWL_FIXED_LDO(VUSB1V5, 0x71, 1500, 17),
+	TWL_FIXED_LDO(VUSB1V8, 0x74, 1800, 18),
+	TWL_FIXED_LDO(VUSB3V1, 0x77, 3100, 19),
+	/* VUSBCP is managed *only* by the USB subchip */
+};
+
+static int twl4030reg_probe(struct platform_device *pdev)
+{
+	int				i;
+	struct twlreg_info		*info;
+	struct regulator_init_data	*initdata;
+	struct regulation_constraints	*c;
+	struct regulator_dev		*rdev;
+
+	for (i = 0, info = NULL; i < ARRAY_SIZE(twl4030_regs); i++) {
+		if (twl4030_regs[i].desc.id != pdev->id)
+			continue;
+		info = twl4030_regs + i;
+		break;
+	}
+	if (!info)
+		return -ENODEV;
+
+	initdata = pdev->dev.platform_data;
+	if (!initdata)
+		return -EINVAL;
+
+	/* Constrain board-specific capabilities according to what
+	 * this driver and the chip itself can actually do.
+	 */
+	c = &initdata->constraints;
+	c->valid_modes_mask &= REGULATOR_MODE_NORMAL | REGULATOR_MODE_STANDBY;
+	c->valid_ops_mask &= REGULATOR_CHANGE_VOLTAGE
+				| REGULATOR_CHANGE_MODE
+				| REGULATOR_CHANGE_STATUS;
+
+	rdev = regulator_register(&info->desc, &pdev->dev, initdata, info);
+	if (IS_ERR(rdev)) {
+		dev_err(&pdev->dev, "can't register %s, %ld\n",
+				info->desc.name, PTR_ERR(rdev));
+		return PTR_ERR(rdev);
+	}
+	platform_set_drvdata(pdev, rdev);
+
+	/* NOTE:  many regulators support short-circuit IRQs (presentable
+	 * as REGULATOR_OVER_CURRENT notifications?) configured via:
+	 *  - SC_CONFIG
+	 *  - SC_DETECT1 (vintana2, vmmc1/2, vaux1/2/3/4)
+	 *  - SC_DETECT2 (vusb, vdac, vio, vdd1/2, vpll2)
+	 *  - IT_CONFIG
+	 */
+
+	return 0;
+}
+
+static int __devexit twl4030reg_remove(struct platform_device *pdev)
+{
+	regulator_unregister(platform_get_drvdata(pdev));
+	return 0;
+}
+
+MODULE_ALIAS("platform:twl4030_reg");
+
+static struct platform_driver twl4030reg_driver = {
+	.probe		= twl4030reg_probe,
+	.remove		= __devexit_p(twl4030reg_remove),
+	/* NOTE: short name, to work around driver model truncation of
+	 * "twl4030_regulator.12" (and friends) to "twl4030_regulator.1".
+	 */
+	.driver.name	= "twl4030_reg",
+	.driver.owner	= THIS_MODULE,
+};
+
+static int __init twl4030reg_init(void)
+{
+	return platform_driver_register(&twl4030reg_driver);
+}
+subsys_initcall(twl4030reg_init);
+
+static void __exit twl4030reg_exit(void)
+{
+	platform_driver_unregister(&twl4030reg_driver);
+}
+module_exit(twl4030reg_exit)
+
+MODULE_DESCRIPTION("TWL4030 regulator driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/regulator/virtual.c b/drivers/regulator/virtual.c
index 5ddb464b1c3..3d08348584e 100644
--- a/drivers/regulator/virtual.c
+++ b/drivers/regulator/virtual.c
@@ -226,13 +226,17 @@ static ssize_t set_mode(struct device *dev, struct device_attribute *attr,
 	unsigned int mode;
 	int ret;
 
-	if (strncmp(buf, "fast", strlen("fast")) == 0)
+	/*
+	 * sysfs_streq() doesn't need the \n's, but we add them so the strings
+	 * will be shared with show_mode(), above.
+	 */
+	if (sysfs_streq(buf, "fast\n") == 0)
 		mode = REGULATOR_MODE_FAST;
-	else if (strncmp(buf, "normal", strlen("normal")) == 0)
+	else if (sysfs_streq(buf, "normal\n") == 0)
 		mode = REGULATOR_MODE_NORMAL;
-	else if (strncmp(buf, "idle", strlen("idle")) == 0)
+	else if (sysfs_streq(buf, "idle\n") == 0)
 		mode = REGULATOR_MODE_IDLE;
-	else if (strncmp(buf, "standby", strlen("standby")) == 0)
+	else if (sysfs_streq(buf, "standby\n") == 0)
 		mode = REGULATOR_MODE_STANDBY;
 	else {
 		dev_err(dev, "Configuring invalid mode\n");
@@ -256,7 +260,7 @@ static DEVICE_ATTR(min_microamps, 0666, show_min_uA, set_min_uA);
 static DEVICE_ATTR(max_microamps, 0666, show_max_uA, set_max_uA);
 static DEVICE_ATTR(mode, 0666, show_mode, set_mode);
 
-struct device_attribute *attributes[] = {
+static struct device_attribute *attributes[] = {
 	&dev_attr_min_microvolts,
 	&dev_attr_max_microvolts,
 	&dev_attr_min_microamps,
diff --git a/drivers/regulator/wm8350-regulator.c b/drivers/regulator/wm8350-regulator.c
index 5056e23e441..771eca1066b 100644
--- a/drivers/regulator/wm8350-regulator.c
+++ b/drivers/regulator/wm8350-regulator.c
@@ -24,6 +24,9 @@
 #include <linux/regulator/driver.h>
 #include <linux/regulator/machine.h>
 
+/* Maximum value possible for VSEL */
+#define WM8350_DCDC_MAX_VSEL 0x66
+
 /* Microamps */
 static const int isink_cur[] = {
 	4,
@@ -385,6 +388,14 @@ static int wm8350_dcdc_get_voltage(struct regulator_dev *rdev)
 	return wm8350_dcdc_val_to_mvolts(val) * 1000;
 }
 
+static int wm8350_dcdc_list_voltage(struct regulator_dev *rdev,
+				    unsigned selector)
+{
+	if (selector > WM8350_DCDC_MAX_VSEL)
+		return -EINVAL;
+	return wm8350_dcdc_val_to_mvolts(selector) * 1000;
+}
+
 static int wm8350_dcdc_set_suspend_voltage(struct regulator_dev *rdev, int uV)
 {
 	struct wm8350 *wm8350 = rdev_get_drvdata(rdev);
@@ -775,6 +786,14 @@ static int wm8350_ldo_get_voltage(struct regulator_dev *rdev)
 	return wm8350_ldo_val_to_mvolts(val) * 1000;
 }
 
+static int wm8350_ldo_list_voltage(struct regulator_dev *rdev,
+				    unsigned selector)
+{
+	if (selector > WM8350_LDO1_VSEL_MASK)
+		return -EINVAL;
+	return wm8350_ldo_val_to_mvolts(selector) * 1000;
+}
+
 int wm8350_dcdc_set_slot(struct wm8350 *wm8350, int dcdc, u16 start,
 			 u16 stop, u16 fault)
 {
@@ -1031,18 +1050,30 @@ static unsigned int wm8350_dcdc_get_mode(struct regulator_dev *rdev)
 	int dcdc = rdev_get_id(rdev);
 	u16 mask, sleep, active, force;
 	int mode = REGULATOR_MODE_NORMAL;
+	int reg;
 
-	if (dcdc < WM8350_DCDC_1 || dcdc > WM8350_DCDC_6)
-		return -EINVAL;
-
-	if (dcdc == WM8350_DCDC_2 || dcdc == WM8350_DCDC_5)
+	switch (dcdc) {
+	case WM8350_DCDC_1:
+		reg = WM8350_DCDC1_FORCE_PWM;
+		break;
+	case WM8350_DCDC_3:
+		reg = WM8350_DCDC3_FORCE_PWM;
+		break;
+	case WM8350_DCDC_4:
+		reg = WM8350_DCDC4_FORCE_PWM;
+		break;
+	case WM8350_DCDC_6:
+		reg = WM8350_DCDC6_FORCE_PWM;
+		break;
+	default:
 		return -EINVAL;
+	}
 
 	mask = 1 << (dcdc - WM8350_DCDC_1);
 	active = wm8350_reg_read(wm8350, WM8350_DCDC_ACTIVE_OPTIONS) & mask;
+	force = wm8350_reg_read(wm8350, reg) & WM8350_DCDC1_FORCE_PWM_ENA;
 	sleep = wm8350_reg_read(wm8350, WM8350_DCDC_SLEEP_OPTIONS) & mask;
-	force = wm8350_reg_read(wm8350, WM8350_DCDC1_FORCE_PWM)
-	    & WM8350_DCDC1_FORCE_PWM_ENA;
+
 	dev_dbg(wm8350->dev, "mask %x active %x sleep %x force %x",
 		mask, active, sleep, force);
 
@@ -1150,6 +1181,7 @@ static int wm8350_ldo_is_enabled(struct regulator_dev *rdev)
 static struct regulator_ops wm8350_dcdc_ops = {
 	.set_voltage = wm8350_dcdc_set_voltage,
 	.get_voltage = wm8350_dcdc_get_voltage,
+	.list_voltage = wm8350_dcdc_list_voltage,
 	.enable = wm8350_dcdc_enable,
 	.disable = wm8350_dcdc_disable,
 	.get_mode = wm8350_dcdc_get_mode,
@@ -1173,6 +1205,7 @@ static struct regulator_ops wm8350_dcdc2_5_ops = {
 static struct regulator_ops wm8350_ldo_ops = {
 	.set_voltage = wm8350_ldo_set_voltage,
 	.get_voltage = wm8350_ldo_get_voltage,
+	.list_voltage = wm8350_ldo_list_voltage,
 	.enable = wm8350_ldo_enable,
 	.disable = wm8350_ldo_disable,
 	.is_enabled = wm8350_ldo_is_enabled,
@@ -1197,6 +1230,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = {
 		.ops = &wm8350_dcdc_ops,
 		.irq = WM8350_IRQ_UV_DC1,
 		.type = REGULATOR_VOLTAGE,
+		.n_voltages = WM8350_DCDC_MAX_VSEL + 1,
 		.owner = THIS_MODULE,
 	},
 	{
@@ -1213,6 +1247,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = {
 		.ops = &wm8350_dcdc_ops,
 		.irq = WM8350_IRQ_UV_DC3,
 		.type = REGULATOR_VOLTAGE,
+		.n_voltages = WM8350_DCDC_MAX_VSEL + 1,
 		.owner = THIS_MODULE,
 	},
 	{
@@ -1221,6 +1256,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = {
 		.ops = &wm8350_dcdc_ops,
 		.irq = WM8350_IRQ_UV_DC4,
 		.type = REGULATOR_VOLTAGE,
+		.n_voltages = WM8350_DCDC_MAX_VSEL + 1,
 		.owner = THIS_MODULE,
 	},
 	{
@@ -1237,6 +1273,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = {
 		.ops = &wm8350_dcdc_ops,
 		.irq = WM8350_IRQ_UV_DC6,
 		.type = REGULATOR_VOLTAGE,
+		.n_voltages = WM8350_DCDC_MAX_VSEL + 1,
 		.owner = THIS_MODULE,
 	},
 	{
@@ -1245,6 +1282,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = {
 		.ops = &wm8350_ldo_ops,
 		.irq = WM8350_IRQ_UV_LDO1,
 		.type = REGULATOR_VOLTAGE,
+		.n_voltages = WM8350_LDO1_VSEL_MASK + 1,
 		.owner = THIS_MODULE,
 	},
 	{
@@ -1253,6 +1291,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = {
 		.ops = &wm8350_ldo_ops,
 		.irq = WM8350_IRQ_UV_LDO2,
 		.type = REGULATOR_VOLTAGE,
+		.n_voltages = WM8350_LDO2_VSEL_MASK + 1,
 		.owner = THIS_MODULE,
 	},
 	{
@@ -1261,6 +1300,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = {
 		.ops = &wm8350_ldo_ops,
 		.irq = WM8350_IRQ_UV_LDO3,
 		.type = REGULATOR_VOLTAGE,
+		.n_voltages = WM8350_LDO3_VSEL_MASK + 1,
 		.owner = THIS_MODULE,
 	},
 	{
@@ -1269,6 +1309,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = {
 		.ops = &wm8350_ldo_ops,
 		.irq = WM8350_IRQ_UV_LDO4,
 		.type = REGULATOR_VOLTAGE,
+		.n_voltages = WM8350_LDO4_VSEL_MASK + 1,
 		.owner = THIS_MODULE,
 	},
 	{
@@ -1293,6 +1334,7 @@ static void pmic_uv_handler(struct wm8350 *wm8350, int irq, void *data)
 {
 	struct regulator_dev *rdev = (struct regulator_dev *)data;
 
+	mutex_lock(&rdev->mutex);
 	if (irq == WM8350_IRQ_CS1 || irq == WM8350_IRQ_CS2)
 		regulator_notifier_call_chain(rdev,
 					      REGULATOR_EVENT_REGULATION_OUT,
@@ -1301,6 +1343,7 @@ static void pmic_uv_handler(struct wm8350 *wm8350, int irq, void *data)
 		regulator_notifier_call_chain(rdev,
 					      REGULATOR_EVENT_UNDER_VOLTAGE,
 					      wm8350);
+	mutex_unlock(&rdev->mutex);
 }
 
 static int wm8350_regulator_probe(struct platform_device *pdev)
@@ -1333,9 +1376,9 @@ static int wm8350_regulator_probe(struct platform_device *pdev)
 		break;
 	}
 
-
 	/* register regulator */
 	rdev = regulator_register(&wm8350_reg[pdev->id], &pdev->dev,
+				  pdev->dev.platform_data,
 				  dev_get_drvdata(&pdev->dev));
 	if (IS_ERR(rdev)) {
 		dev_err(&pdev->dev, "failed to register %s\n",
diff --git a/drivers/regulator/wm8400-regulator.c b/drivers/regulator/wm8400-regulator.c
index 56e23d44ba5..15742602907 100644
--- a/drivers/regulator/wm8400-regulator.c
+++ b/drivers/regulator/wm8400-regulator.c
@@ -43,6 +43,18 @@ static int wm8400_ldo_disable(struct regulator_dev *dev)
 			       WM8400_LDO1_ENA, 0);
 }
 
+static int wm8400_ldo_list_voltage(struct regulator_dev *dev,
+				   unsigned selector)
+{
+	if (selector > WM8400_LDO1_VSEL_MASK)
+		return -EINVAL;
+
+	if (selector < 15)
+		return 900000 + (selector * 50000);
+	else
+		return 1600000 + ((selector - 14) * 100000);
+}
+
 static int wm8400_ldo_get_voltage(struct regulator_dev *dev)
 {
 	struct wm8400 *wm8400 = rdev_get_drvdata(dev);
@@ -51,10 +63,7 @@ static int wm8400_ldo_get_voltage(struct regulator_dev *dev)
 	val = wm8400_reg_read(wm8400, WM8400_LDO1_CONTROL + rdev_get_id(dev));
 	val &= WM8400_LDO1_VSEL_MASK;
 
-	if (val < 15)
-		return 900000 + (val * 50000);
-	else
-		return 1600000 + ((val - 14) * 100000);
+	return wm8400_ldo_list_voltage(dev, val);
 }
 
 static int wm8400_ldo_set_voltage(struct regulator_dev *dev,
@@ -92,6 +101,7 @@ static struct regulator_ops wm8400_ldo_ops = {
 	.is_enabled = wm8400_ldo_is_enabled,
 	.enable = wm8400_ldo_enable,
 	.disable = wm8400_ldo_disable,
+	.list_voltage = wm8400_ldo_list_voltage,
 	.get_voltage = wm8400_ldo_get_voltage,
 	.set_voltage = wm8400_ldo_set_voltage,
 };
@@ -124,6 +134,15 @@ static int wm8400_dcdc_disable(struct regulator_dev *dev)
 			       WM8400_DC1_ENA, 0);
 }
 
+static int wm8400_dcdc_list_voltage(struct regulator_dev *dev,
+				    unsigned selector)
+{
+	if (selector > WM8400_DC1_VSEL_MASK)
+		return -EINVAL;
+
+	return 850000 + (selector * 25000);
+}
+
 static int wm8400_dcdc_get_voltage(struct regulator_dev *dev)
 {
 	struct wm8400 *wm8400 = rdev_get_drvdata(dev);
@@ -237,6 +256,7 @@ static struct regulator_ops wm8400_dcdc_ops = {
 	.is_enabled = wm8400_dcdc_is_enabled,
 	.enable = wm8400_dcdc_enable,
 	.disable = wm8400_dcdc_disable,
+	.list_voltage = wm8400_dcdc_list_voltage,
 	.get_voltage = wm8400_dcdc_get_voltage,
 	.set_voltage = wm8400_dcdc_set_voltage,
 	.get_mode = wm8400_dcdc_get_mode,
@@ -249,6 +269,7 @@ static struct regulator_desc regulators[] = {
 		.name = "LDO1",
 		.id = WM8400_LDO1,
 		.ops = &wm8400_ldo_ops,
+		.n_voltages = WM8400_LDO1_VSEL_MASK + 1,
 		.type = REGULATOR_VOLTAGE,
 		.owner = THIS_MODULE,
 	},
@@ -256,6 +277,7 @@ static struct regulator_desc regulators[] = {
 		.name = "LDO2",
 		.id = WM8400_LDO2,
 		.ops = &wm8400_ldo_ops,
+		.n_voltages = WM8400_LDO2_VSEL_MASK + 1,
 		.type = REGULATOR_VOLTAGE,
 		.owner = THIS_MODULE,
 	},
@@ -263,6 +285,7 @@ static struct regulator_desc regulators[] = {
 		.name = "LDO3",
 		.id = WM8400_LDO3,
 		.ops = &wm8400_ldo_ops,
+		.n_voltages = WM8400_LDO3_VSEL_MASK + 1,
 		.type = REGULATOR_VOLTAGE,
 		.owner = THIS_MODULE,
 	},
@@ -270,6 +293,7 @@ static struct regulator_desc regulators[] = {
 		.name = "LDO4",
 		.id = WM8400_LDO4,
 		.ops = &wm8400_ldo_ops,
+		.n_voltages = WM8400_LDO4_VSEL_MASK + 1,
 		.type = REGULATOR_VOLTAGE,
 		.owner = THIS_MODULE,
 	},
@@ -277,6 +301,7 @@ static struct regulator_desc regulators[] = {
 		.name = "DCDC1",
 		.id = WM8400_DCDC1,
 		.ops = &wm8400_dcdc_ops,
+		.n_voltages = WM8400_DC1_VSEL_MASK + 1,
 		.type = REGULATOR_VOLTAGE,
 		.owner = THIS_MODULE,
 	},
@@ -284,6 +309,7 @@ static struct regulator_desc regulators[] = {
 		.name = "DCDC2",
 		.id = WM8400_DCDC2,
 		.ops = &wm8400_dcdc_ops,
+		.n_voltages = WM8400_DC2_VSEL_MASK + 1,
 		.type = REGULATOR_VOLTAGE,
 		.owner = THIS_MODULE,
 	},
@@ -294,7 +320,7 @@ static int __devinit wm8400_regulator_probe(struct platform_device *pdev)
 	struct regulator_dev *rdev;
 
 	rdev = regulator_register(&regulators[pdev->id], &pdev->dev,
-		pdev->dev.driver_data);
+		pdev->dev.platform_data, pdev->dev.driver_data);
 
 	if (IS_ERR(rdev))
 		return PTR_ERR(rdev);
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 56002f7d26b..ffe34a12f44 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -688,22 +688,16 @@ config RTC_DRV_RS5C313
 	help
 	  If you say yes here you get support for the Ricoh RS5C313 RTC chips.
 
-config RTC_DRV_PARISC
-	tristate "PA-RISC firmware RTC support"
-	depends on PARISC
-	help
-	  Say Y or M here to enable RTC support on PA-RISC systems using
-	  firmware calls. If you do not know what you are doing, you should
+config RTC_DRV_GENERIC
+	tristate "Generic RTC support"
+	# Please consider writing a new RTC driver instead of using the generic
+	# RTC abstraction
+	depends on PARISC || M68K || PPC
+	help
+	  Say Y or M here to enable RTC support on systems using the generic
+	  RTC abstraction. If you do not know what you are doing, you should
 	  just say Y.
 
-config RTC_DRV_PPC
-       tristate "PowerPC machine dependent RTC support"
-       depends on PPC
-       help
-	 The PowerPC kernel has machine-specific functions for accessing
-	 the RTC. This exposes that functionality through the generic RTC
-	 class.
-
 config RTC_DRV_PXA
        tristate "PXA27x/PXA3xx"
        depends on ARCH_PXA
@@ -747,4 +741,13 @@ config RTC_DRV_MV
 	  This driver can also be built as a module. If so, the module
 	  will be called rtc-mv.
 
+config RTC_DRV_PS3
+	tristate "PS3 RTC"
+	depends on PPC_PS3
+	help
+	  If you say yes here you will get support for the RTC on PS3.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called rtc-ps3.
+
 endif # RTC_CLASS
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index e7b09986d26..6c0639a14f0 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -56,8 +56,7 @@ obj-$(CONFIG_RTC_DRV_PCF8563)	+= rtc-pcf8563.o
 obj-$(CONFIG_RTC_DRV_PCF8583)	+= rtc-pcf8583.o
 obj-$(CONFIG_RTC_DRV_PL030)	+= rtc-pl030.o
 obj-$(CONFIG_RTC_DRV_PL031)	+= rtc-pl031.o
-obj-$(CONFIG_RTC_DRV_PARISC)	+= rtc-parisc.o
-obj-$(CONFIG_RTC_DRV_PPC)	+= rtc-ppc.o
+obj-$(CONFIG_RTC_DRV_GENERIC)	+= rtc-generic.o
 obj-$(CONFIG_RTC_DRV_PXA)	+= rtc-pxa.o
 obj-$(CONFIG_RTC_DRV_R9701)	+= rtc-r9701.o
 obj-$(CONFIG_RTC_DRV_RS5C313)	+= rtc-rs5c313.o
@@ -77,3 +76,4 @@ obj-$(CONFIG_RTC_DRV_VR41XX)	+= rtc-vr41xx.o
 obj-$(CONFIG_RTC_DRV_WM8350)	+= rtc-wm8350.o
 obj-$(CONFIG_RTC_DRV_X1205)	+= rtc-x1205.o
 obj-$(CONFIG_RTC_DRV_PCF50633)	+= rtc-pcf50633.o
+obj-$(CONFIG_RTC_DRV_PS3)	+= rtc-ps3.o
diff --git a/drivers/rtc/rtc-generic.c b/drivers/rtc/rtc-generic.c
new file mode 100644
index 00000000000..98322004ad2
--- /dev/null
+++ b/drivers/rtc/rtc-generic.c
@@ -0,0 +1,84 @@
+/* rtc-generic: RTC driver using the generic RTC abstraction
+ *
+ * Copyright (C) 2008 Kyle McMartin <kyle@mcmartin.ca>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/platform_device.h>
+#include <linux/rtc.h>
+
+#include <asm/rtc.h>
+
+static int generic_get_time(struct device *dev, struct rtc_time *tm)
+{
+	unsigned int ret = get_rtc_time(tm);
+
+	if (ret & RTC_BATT_BAD)
+		return -EOPNOTSUPP;
+
+	return rtc_valid_tm(tm);
+}
+
+static int generic_set_time(struct device *dev, struct rtc_time *tm)
+{
+	if (set_rtc_time(tm) < 0)
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+static const struct rtc_class_ops generic_rtc_ops = {
+	.read_time = generic_get_time,
+	.set_time = generic_set_time,
+};
+
+static int __init generic_rtc_probe(struct platform_device *dev)
+{
+	struct rtc_device *rtc;
+
+	rtc = rtc_device_register("rtc-generic", &dev->dev, &generic_rtc_ops,
+				  THIS_MODULE);
+	if (IS_ERR(rtc))
+		return PTR_ERR(rtc);
+
+	platform_set_drvdata(dev, rtc);
+
+	return 0;
+}
+
+static int __exit generic_rtc_remove(struct platform_device *dev)
+{
+	struct rtc_device *rtc = platform_get_drvdata(dev);
+
+	rtc_device_unregister(rtc);
+
+	return 0;
+}
+
+static struct platform_driver generic_rtc_driver = {
+	.driver = {
+		.name = "rtc-generic",
+		.owner = THIS_MODULE,
+	},
+	.remove = __exit_p(generic_rtc_remove),
+};
+
+static int __init generic_rtc_init(void)
+{
+	return platform_driver_probe(&generic_rtc_driver, generic_rtc_probe);
+}
+
+static void __exit generic_rtc_fini(void)
+{
+	platform_driver_unregister(&generic_rtc_driver);
+}
+
+module_init(generic_rtc_init);
+module_exit(generic_rtc_fini);
+
+MODULE_AUTHOR("Kyle McMartin <kyle@mcmartin.ca>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Generic RTC driver");
+MODULE_ALIAS("platform:rtc-generic");
diff --git a/drivers/rtc/rtc-parisc.c b/drivers/rtc/rtc-parisc.c
deleted file mode 100644
index b966f56da97..00000000000
--- a/drivers/rtc/rtc-parisc.c
+++ /dev/null
@@ -1,86 +0,0 @@
-/* rtc-parisc: RTC for HP PA-RISC firmware
- *
- * Copyright (C) 2008 Kyle McMartin <kyle@mcmartin.ca>
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/time.h>
-#include <linux/platform_device.h>
-#include <linux/rtc.h>
-
-#include <asm/rtc.h>
-
-static int parisc_get_time(struct device *dev, struct rtc_time *tm)
-{
-	unsigned long ret;
-
-	ret = get_rtc_time(tm);
-
-	if (ret & RTC_BATT_BAD)
-		return -EOPNOTSUPP;
-
-	return rtc_valid_tm(tm);
-}
-
-static int parisc_set_time(struct device *dev, struct rtc_time *tm)
-{
-	if (set_rtc_time(tm) < 0)
-		return -EOPNOTSUPP;
-
-	return 0;
-}
-
-static const struct rtc_class_ops parisc_rtc_ops = {
-	.read_time = parisc_get_time,
-	.set_time = parisc_set_time,
-};
-
-static int __init parisc_rtc_probe(struct platform_device *dev)
-{
-	struct rtc_device *rtc;
-
-	rtc = rtc_device_register("rtc-parisc", &dev->dev, &parisc_rtc_ops,
-				  THIS_MODULE);
-	if (IS_ERR(rtc))
-		return PTR_ERR(rtc);
-
-	platform_set_drvdata(dev, rtc);
-
-	return 0;
-}
-
-static int __exit parisc_rtc_remove(struct platform_device *dev)
-{
-	struct rtc_device *rtc = platform_get_drvdata(dev);
-
-	rtc_device_unregister(rtc);
-
-	return 0;
-}
-
-static struct platform_driver parisc_rtc_driver = {
-	.driver = {
-		.name = "rtc-parisc",
-		.owner = THIS_MODULE,
-	},
-	.probe = parisc_rtc_probe,
-	.remove = __devexit_p(parisc_rtc_remove),
-};
-
-static int __init parisc_rtc_init(void)
-{
-	return platform_driver_probe(&parisc_rtc_driver, parisc_rtc_probe);
-}
-
-static void __exit parisc_rtc_fini(void)
-{
-	platform_driver_unregister(&parisc_rtc_driver);
-}
-
-module_init(parisc_rtc_init);
-module_exit(parisc_rtc_fini);
-
-MODULE_AUTHOR("Kyle McMartin <kyle@mcmartin.ca>");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("HP PA-RISC RTC driver");
diff --git a/drivers/rtc/rtc-ppc.c b/drivers/rtc/rtc-ppc.c
deleted file mode 100644
index c8e97e25ef7..00000000000
--- a/drivers/rtc/rtc-ppc.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * RTC driver for ppc_md RTC functions
- *
- * © 2007 Red Hat, Inc.
- *
- * Author: David Woodhouse <dwmw2@infradead.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-
-#include <linux/module.h>
-#include <linux/err.h>
-#include <linux/rtc.h>
-#include <linux/platform_device.h>
-#include <asm/machdep.h>
-
-static int ppc_rtc_read_time(struct device *dev, struct rtc_time *tm)
-{
-	ppc_md.get_rtc_time(tm);
-	return 0;
-}
-
-static int ppc_rtc_set_time(struct device *dev, struct rtc_time *tm)
-{
-	return ppc_md.set_rtc_time(tm);
-}
-
-static const struct rtc_class_ops ppc_rtc_ops = {
-	.set_time = ppc_rtc_set_time,
-	.read_time = ppc_rtc_read_time,
-};
-
-static struct rtc_device *rtc;
-static struct platform_device *ppc_rtc_pdev;
-
-static int __init ppc_rtc_init(void)
-{
-	if (!ppc_md.get_rtc_time || !ppc_md.set_rtc_time)
-		return -ENODEV;
-
-	ppc_rtc_pdev = platform_device_register_simple("ppc-rtc", 0, NULL, 0);
-	if (IS_ERR(ppc_rtc_pdev))
-		return PTR_ERR(ppc_rtc_pdev);
-
-	rtc = rtc_device_register("ppc_md", &ppc_rtc_pdev->dev,
-				  &ppc_rtc_ops, THIS_MODULE);
-	if (IS_ERR(rtc)) {
-		platform_device_unregister(ppc_rtc_pdev);
-		return PTR_ERR(rtc);
-	}
-
-	return 0;
-}
-
-static void __exit ppc_rtc_exit(void)
-{
-	rtc_device_unregister(rtc);
-	platform_device_unregister(ppc_rtc_pdev);
-}
-
-module_init(ppc_rtc_init);
-module_exit(ppc_rtc_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org>");
-MODULE_DESCRIPTION("Generic RTC class driver for PowerPC");
diff --git a/drivers/rtc/rtc-ps3.c b/drivers/rtc/rtc-ps3.c
new file mode 100644
index 00000000000..968133ce1ee
--- /dev/null
+++ b/drivers/rtc/rtc-ps3.c
@@ -0,0 +1,104 @@
+/*
+ * PS3 RTC Driver
+ *
+ * Copyright 2009 Sony Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/rtc.h>
+
+#include <asm/lv1call.h>
+#include <asm/ps3.h>
+
+
+static u64 read_rtc(void)
+{
+	int result;
+	u64 rtc_val;
+	u64 tb_val;
+
+	result = lv1_get_rtc(&rtc_val, &tb_val);
+	BUG_ON(result);
+
+	return rtc_val;
+}
+
+static int ps3_get_time(struct device *dev, struct rtc_time *tm)
+{
+	rtc_time_to_tm(read_rtc() + ps3_os_area_get_rtc_diff(), tm);
+	return rtc_valid_tm(tm);
+}
+
+static int ps3_set_time(struct device *dev, struct rtc_time *tm)
+{
+	unsigned long now;
+
+	rtc_tm_to_time(tm, &now);
+	ps3_os_area_set_rtc_diff(now - read_rtc());
+	return 0;
+}
+
+static const struct rtc_class_ops ps3_rtc_ops = {
+	.read_time = ps3_get_time,
+	.set_time = ps3_set_time,
+};
+
+static int __init ps3_rtc_probe(struct platform_device *dev)
+{
+	struct rtc_device *rtc;
+
+	rtc = rtc_device_register("rtc-ps3", &dev->dev, &ps3_rtc_ops,
+				  THIS_MODULE);
+	if (IS_ERR(rtc))
+		return PTR_ERR(rtc);
+
+	platform_set_drvdata(dev, rtc);
+	return 0;
+}
+
+static int __exit ps3_rtc_remove(struct platform_device *dev)
+{
+	rtc_device_unregister(platform_get_drvdata(dev));
+	return 0;
+}
+
+static struct platform_driver ps3_rtc_driver = {
+	.driver = {
+		.name = "rtc-ps3",
+		.owner = THIS_MODULE,
+	},
+	.remove = __exit_p(ps3_rtc_remove),
+};
+
+static int __init ps3_rtc_init(void)
+{
+	return platform_driver_probe(&ps3_rtc_driver, ps3_rtc_probe);
+}
+
+static void __exit ps3_rtc_fini(void)
+{
+	platform_driver_unregister(&ps3_rtc_driver);
+}
+
+module_init(ps3_rtc_init);
+module_exit(ps3_rtc_fini);
+
+MODULE_AUTHOR("Sony Corporation");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ps3 RTC driver");
+MODULE_ALIAS("platform:rtc-ps3");
diff --git a/drivers/serial/mcf.c b/drivers/serial/mcf.c
index 56841fe5f48..0eefb07beba 100644
--- a/drivers/serial/mcf.c
+++ b/drivers/serial/mcf.c
@@ -513,7 +513,7 @@ static int __init mcf_console_setup(struct console *co, char *options)
 	int parity = 'n';
 	int flow = 'n';
 
-	if ((co->index >= 0) && (co->index <= MCF_MAXPORTS))
+	if ((co->index < 0) || (co->index >= MCF_MAXPORTS))
 		co->index = 0;
 	port = &mcf_ports[co->index].port;
 	if (port->membase == 0)
diff --git a/drivers/usb/storage/isd200.c b/drivers/usb/storage/isd200.c
index 882c57b399f..fdba2f69d4c 100644
--- a/drivers/usb/storage/isd200.c
+++ b/drivers/usb/storage/isd200.c
@@ -46,6 +46,7 @@
 #include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/ata.h>
 #include <linux/hdreg.h>
 #include <linux/scatterlist.h>
 
@@ -328,7 +329,7 @@ struct isd200_config {
 
 struct isd200_info {
 	struct inquiry_data InquiryData;
-	struct hd_driveid *id;
+	u16 *id;
 	struct isd200_config ConfigData;
 	unsigned char *RegsBuf;
 	unsigned char ATARegs[8];
@@ -419,19 +420,19 @@ static void isd200_build_sense(struct us_data *us, struct scsi_cmnd *srb)
 		buf->Flags = UNIT_ATTENTION;
 		buf->AdditionalSenseCode = 0;
 		buf->AdditionalSenseCodeQualifier = 0;
-	} else if(error & MCR_ERR) {
+	} else if (error & ATA_MCR) {
 		buf->ErrorCode = 0x70 | SENSE_ERRCODE_VALID;
 		buf->AdditionalSenseLength = 0xb;
 		buf->Flags =  UNIT_ATTENTION;
 		buf->AdditionalSenseCode = 0;
 		buf->AdditionalSenseCodeQualifier = 0;
-	} else if(error & TRK0_ERR) {
+	} else if (error & ATA_TRK0NF) {
 		buf->ErrorCode = 0x70 | SENSE_ERRCODE_VALID;
 		buf->AdditionalSenseLength = 0xb;
 		buf->Flags =  NOT_READY;
 		buf->AdditionalSenseCode = 0;
 		buf->AdditionalSenseCodeQualifier = 0;
-	} else if(error & ECC_ERR) {
+	} else if (error & ATA_UNC) {
 		buf->ErrorCode = 0x70 | SENSE_ERRCODE_VALID;
 		buf->AdditionalSenseLength = 0xb;
 		buf->Flags =  DATA_PROTECT;
@@ -547,16 +548,16 @@ static int isd200_action( struct us_data *us, int action,
 		ata.generic.ActionSelect = ACTION_SELECT_1|ACTION_SELECT_5;
 		ata.generic.RegisterSelect = REG_DEVICE_HEAD | REG_COMMAND;
 		ata.write.DeviceHeadByte = info->DeviceHead;
-		ata.write.CommandByte = WIN_SRST;
+		ata.write.CommandByte = ATA_CMD_DEV_RESET;
 		isd200_set_srb(info, DMA_NONE, NULL, 0);
 		break;
 
 	case ACTION_IDENTIFY:
 		US_DEBUGP("   isd200_action(IDENTIFY)\n");
 		ata.generic.RegisterSelect = REG_COMMAND;
-		ata.write.CommandByte = WIN_IDENTIFY;
+		ata.write.CommandByte = ATA_CMD_ID_ATA;
 		isd200_set_srb(info, DMA_FROM_DEVICE, info->id,
-		                                sizeof(struct hd_driveid));
+				ATA_ID_WORDS * 2);
 		break;
 
 	default:
@@ -944,22 +945,22 @@ static int isd200_try_enum(struct us_data *us, unsigned char master_slave,
 			break;
 
 		if (!detect) {
-			if (regs[ATA_REG_STATUS_OFFSET] & BUSY_STAT) {
+			if (regs[ATA_REG_STATUS_OFFSET] & ATA_BUSY) {
 				US_DEBUGP("   %s status is still BSY, try again...\n",mstr);
 			} else {
 				US_DEBUGP("   %s status !BSY, continue with next operation\n",mstr);
 				break;
 			}
 		}
-		/* check for BUSY_STAT and */
-		/* WRERR_STAT (workaround ATA Zip drive) and */ 
-		/* ERR_STAT (workaround for Archos CD-ROM) */
+		/* check for ATA_BUSY and */
+		/* ATA_DF (workaround ATA Zip drive) and */
+		/* ATA_ERR (workaround for Archos CD-ROM) */
 		else if (regs[ATA_REG_STATUS_OFFSET] &
-			 (BUSY_STAT | WRERR_STAT | ERR_STAT )) {
+			 (ATA_BUSY | ATA_DF | ATA_ERR)) {
 			US_DEBUGP("   Status indicates it is not ready, try again...\n");
 		}
 		/* check for DRDY, ATA devices set DRDY after SRST */
-		else if (regs[ATA_REG_STATUS_OFFSET] & READY_STAT) {
+		else if (regs[ATA_REG_STATUS_OFFSET] & ATA_DRDY) {
 			US_DEBUGP("   Identified ATA device\n");
 			info->DeviceFlags |= DF_ATA_DEVICE;
 			info->DeviceHead = master_slave;
@@ -1053,103 +1054,50 @@ static int isd200_manual_enum(struct us_data *us)
 	return(retStatus);
 }
 
-static void isd200_fix_driveid (struct hd_driveid *id)
+static void isd200_fix_driveid(u16 *id)
 {
 #ifndef __LITTLE_ENDIAN
 # ifdef __BIG_ENDIAN
 	int i;
-	u16 *stringcast;
-
-	id->config         = __le16_to_cpu(id->config);
-	id->cyls           = __le16_to_cpu(id->cyls);
-	id->reserved2      = __le16_to_cpu(id->reserved2);
-	id->heads          = __le16_to_cpu(id->heads);
-	id->track_bytes    = __le16_to_cpu(id->track_bytes);
-	id->sector_bytes   = __le16_to_cpu(id->sector_bytes);
-	id->sectors        = __le16_to_cpu(id->sectors);
-	id->vendor0        = __le16_to_cpu(id->vendor0);
-	id->vendor1        = __le16_to_cpu(id->vendor1);
-	id->vendor2        = __le16_to_cpu(id->vendor2);
-	stringcast = (u16 *)&id->serial_no[0];
-	for (i = 0; i < (20/2); i++)
-		stringcast[i] = __le16_to_cpu(stringcast[i]);
-	id->buf_type       = __le16_to_cpu(id->buf_type);
-	id->buf_size       = __le16_to_cpu(id->buf_size);
-	id->ecc_bytes      = __le16_to_cpu(id->ecc_bytes);
-	stringcast = (u16 *)&id->fw_rev[0];
-	for (i = 0; i < (8/2); i++)
-		stringcast[i] = __le16_to_cpu(stringcast[i]);
-	stringcast = (u16 *)&id->model[0];
-	for (i = 0; i < (40/2); i++)
-		stringcast[i] = __le16_to_cpu(stringcast[i]);
-	id->dword_io       = __le16_to_cpu(id->dword_io);
-	id->reserved50     = __le16_to_cpu(id->reserved50);
-	id->field_valid    = __le16_to_cpu(id->field_valid);
-	id->cur_cyls       = __le16_to_cpu(id->cur_cyls);
-	id->cur_heads      = __le16_to_cpu(id->cur_heads);
-	id->cur_sectors    = __le16_to_cpu(id->cur_sectors);
-	id->cur_capacity0  = __le16_to_cpu(id->cur_capacity0);
-	id->cur_capacity1  = __le16_to_cpu(id->cur_capacity1);
-	id->lba_capacity   = __le32_to_cpu(id->lba_capacity);
-	id->dma_1word      = __le16_to_cpu(id->dma_1word);
-	id->dma_mword      = __le16_to_cpu(id->dma_mword);
-	id->eide_pio_modes = __le16_to_cpu(id->eide_pio_modes);
-	id->eide_dma_min   = __le16_to_cpu(id->eide_dma_min);
-	id->eide_dma_time  = __le16_to_cpu(id->eide_dma_time);
-	id->eide_pio       = __le16_to_cpu(id->eide_pio);
-	id->eide_pio_iordy = __le16_to_cpu(id->eide_pio_iordy);
-	for (i = 0; i < 2; ++i)
-		id->words69_70[i] = __le16_to_cpu(id->words69_70[i]);
-	for (i = 0; i < 4; ++i)
-		id->words71_74[i] = __le16_to_cpu(id->words71_74[i]);
-	id->queue_depth    = __le16_to_cpu(id->queue_depth);
-	for (i = 0; i < 4; ++i)
-		id->words76_79[i] = __le16_to_cpu(id->words76_79[i]);
-	id->major_rev_num  = __le16_to_cpu(id->major_rev_num);
-	id->minor_rev_num  = __le16_to_cpu(id->minor_rev_num);
-	id->command_set_1  = __le16_to_cpu(id->command_set_1);
-	id->command_set_2  = __le16_to_cpu(id->command_set_2);
-	id->cfsse          = __le16_to_cpu(id->cfsse);
-	id->cfs_enable_1   = __le16_to_cpu(id->cfs_enable_1);
-	id->cfs_enable_2   = __le16_to_cpu(id->cfs_enable_2);
-	id->csf_default    = __le16_to_cpu(id->csf_default);
-	id->dma_ultra      = __le16_to_cpu(id->dma_ultra);
-	id->trseuc         = __le16_to_cpu(id->trseuc);
-	id->trsEuc         = __le16_to_cpu(id->trsEuc);
-	id->CurAPMvalues   = __le16_to_cpu(id->CurAPMvalues);
-	id->mprc           = __le16_to_cpu(id->mprc);
-	id->hw_config      = __le16_to_cpu(id->hw_config);
-	id->acoustic       = __le16_to_cpu(id->acoustic);
-	id->msrqs          = __le16_to_cpu(id->msrqs);
-	id->sxfert         = __le16_to_cpu(id->sxfert);
-	id->sal            = __le16_to_cpu(id->sal);
-	id->spg            = __le32_to_cpu(id->spg);
-	id->lba_capacity_2 = __le64_to_cpu(id->lba_capacity_2);
-	for (i = 0; i < 22; i++)
-		id->words104_125[i]   = __le16_to_cpu(id->words104_125[i]);
-	id->last_lun       = __le16_to_cpu(id->last_lun);
-	id->word127        = __le16_to_cpu(id->word127);
-	id->dlf            = __le16_to_cpu(id->dlf);
-	id->csfo           = __le16_to_cpu(id->csfo);
-	for (i = 0; i < 26; i++)
-		id->words130_155[i] = __le16_to_cpu(id->words130_155[i]);
-	id->word156        = __le16_to_cpu(id->word156);
-	for (i = 0; i < 3; i++)
-		id->words157_159[i] = __le16_to_cpu(id->words157_159[i]);
-	id->cfa_power      = __le16_to_cpu(id->cfa_power);
-	for (i = 0; i < 14; i++)
-		id->words161_175[i] = __le16_to_cpu(id->words161_175[i]);
-	for (i = 0; i < 31; i++)
-		id->words176_205[i] = __le16_to_cpu(id->words176_205[i]);
-	for (i = 0; i < 48; i++)
-		id->words206_254[i] = __le16_to_cpu(id->words206_254[i]);
-	id->integrity_word  = __le16_to_cpu(id->integrity_word);
+
+	for (i = 0; i < ATA_ID_WORDS; i++)
+		id[i] = __le16_to_cpu(id[i]);
 # else
 #  error "Please fix <asm/byteorder.h>"
 # endif
 #endif
 }
 
+static void isd200_dump_driveid(u16 *id)
+{
+	US_DEBUGP("   Identify Data Structure:\n");
+	US_DEBUGP("      config = 0x%x\n",	  id[ATA_ID_CONFIG]);
+	US_DEBUGP("      cyls = 0x%x\n",	  id[ATA_ID_CYLS]);
+	US_DEBUGP("      heads = 0x%x\n",	  id[ATA_ID_HEADS]);
+	US_DEBUGP("      track_bytes = 0x%x\n",	  id[4]);
+	US_DEBUGP("      sector_bytes = 0x%x\n",  id[5]);
+	US_DEBUGP("      sectors = 0x%x\n",	  id[ATA_ID_SECTORS]);
+	US_DEBUGP("      serial_no[0] = 0x%x\n",  *(char *)&id[ATA_ID_SERNO]);
+	US_DEBUGP("      buf_type = 0x%x\n",	  id[20]);
+	US_DEBUGP("      buf_size = 0x%x\n",	  id[ATA_ID_BUF_SIZE]);
+	US_DEBUGP("      ecc_bytes = 0x%x\n",	  id[22]);
+	US_DEBUGP("      fw_rev[0] = 0x%x\n",	  *(char *)&id[ATA_ID_FW_REV]);
+	US_DEBUGP("      model[0] = 0x%x\n",	  *(char *)&id[ATA_ID_PROD]);
+	US_DEBUGP("      max_multsect = 0x%x\n",  id[ATA_ID_MAX_MULTSECT] & 0xff);
+	US_DEBUGP("      dword_io = 0x%x\n",	  id[ATA_ID_DWORD_IO]);
+	US_DEBUGP("      capability = 0x%x\n",	  id[ATA_ID_CAPABILITY] >> 8);
+	US_DEBUGP("      tPIO = 0x%x\n",	  id[ATA_ID_OLD_PIO_MODES] >> 8);
+	US_DEBUGP("      tDMA = 0x%x\n",	  id[ATA_ID_OLD_DMA_MODES] >> 8);
+	US_DEBUGP("      field_valid = 0x%x\n",	  id[ATA_ID_FIELD_VALID]);
+	US_DEBUGP("      cur_cyls = 0x%x\n",	  id[ATA_ID_CUR_CYLS]);
+	US_DEBUGP("      cur_heads = 0x%x\n",	  id[ATA_ID_CUR_HEADS]);
+	US_DEBUGP("      cur_sectors = 0x%x\n",	  id[ATA_ID_CUR_SECTORS]);
+	US_DEBUGP("      cur_capacity = 0x%x\n",  ata_id_u32(id, 57));
+	US_DEBUGP("      multsect = 0x%x\n",	  id[ATA_ID_MULTSECT] & 0xff);
+	US_DEBUGP("      lba_capacity = 0x%x\n",  ata_id_u32(id, ATA_ID_LBA_CAPACITY));
+	US_DEBUGP("      command_set_1 = 0x%x\n", id[ATA_ID_COMMAND_SET_1]);
+	US_DEBUGP("      command_set_2 = 0x%x\n", id[ATA_ID_COMMAND_SET_2]);
+}
 
 /**************************************************************************
  * isd200_get_inquiry_data
@@ -1163,7 +1111,7 @@ static int isd200_get_inquiry_data( struct us_data *us )
 {
 	struct isd200_info *info = (struct isd200_info *)us->extra;
 	int retStatus = ISD200_GOOD;
-	struct hd_driveid *id = info->id;
+	u16 *id = info->id;
 
 	US_DEBUGP("Entering isd200_get_inquiry_data\n");
 
@@ -1180,8 +1128,7 @@ static int isd200_get_inquiry_data( struct us_data *us )
 			/* this must be an ATA device */
 			/* perform an ATA Command Identify */
 			transferStatus = isd200_action( us, ACTION_IDENTIFY,
-							id, 
-							sizeof(struct hd_driveid) );
+							id, ATA_ID_WORDS * 2);
 			if (transferStatus != ISD200_TRANSPORT_GOOD) {
 				/* Error issuing ATA Command Identify */
 				US_DEBUGP("   Error issuing ATA Command Identify\n");
@@ -1191,35 +1138,9 @@ static int isd200_get_inquiry_data( struct us_data *us )
 				int i;
 				__be16 *src;
 				__u16 *dest;
-				isd200_fix_driveid(id);
 
-				US_DEBUGP("   Identify Data Structure:\n");
-				US_DEBUGP("      config = 0x%x\n", id->config);
-				US_DEBUGP("      cyls = 0x%x\n", id->cyls);
-				US_DEBUGP("      heads = 0x%x\n", id->heads);
-				US_DEBUGP("      track_bytes = 0x%x\n", id->track_bytes);
-				US_DEBUGP("      sector_bytes = 0x%x\n", id->sector_bytes);
-				US_DEBUGP("      sectors = 0x%x\n", id->sectors);
-				US_DEBUGP("      serial_no[0] = 0x%x\n", id->serial_no[0]);
-				US_DEBUGP("      buf_type = 0x%x\n", id->buf_type);
-				US_DEBUGP("      buf_size = 0x%x\n", id->buf_size);
-				US_DEBUGP("      ecc_bytes = 0x%x\n", id->ecc_bytes);
-				US_DEBUGP("      fw_rev[0] = 0x%x\n", id->fw_rev[0]);
-				US_DEBUGP("      model[0] = 0x%x\n", id->model[0]);
-				US_DEBUGP("      max_multsect = 0x%x\n", id->max_multsect);
-				US_DEBUGP("      dword_io = 0x%x\n", id->dword_io);
-				US_DEBUGP("      capability = 0x%x\n", id->capability);
-				US_DEBUGP("      tPIO = 0x%x\n", id->tPIO);
-				US_DEBUGP("      tDMA = 0x%x\n", id->tDMA);
-				US_DEBUGP("      field_valid = 0x%x\n", id->field_valid);
-				US_DEBUGP("      cur_cyls = 0x%x\n", id->cur_cyls);
-				US_DEBUGP("      cur_heads = 0x%x\n", id->cur_heads);
-				US_DEBUGP("      cur_sectors = 0x%x\n", id->cur_sectors);
-				US_DEBUGP("      cur_capacity = 0x%x\n", (id->cur_capacity1 << 16) + id->cur_capacity0 );
-				US_DEBUGP("      multsect = 0x%x\n", id->multsect);
-				US_DEBUGP("      lba_capacity = 0x%x\n", id->lba_capacity);
-				US_DEBUGP("      command_set_1 = 0x%x\n", id->command_set_1);
-				US_DEBUGP("      command_set_2 = 0x%x\n", id->command_set_2);
+				isd200_fix_driveid(id);
+				isd200_dump_driveid(id);
 
 				memset(&info->InquiryData, 0, sizeof(info->InquiryData));
 
@@ -1229,30 +1150,30 @@ static int isd200_get_inquiry_data( struct us_data *us )
 				/* The length must be at least 36 (5 + 31) */
 				info->InquiryData.AdditionalLength = 0x1F;
 
-				if (id->command_set_1 & COMMANDSET_MEDIA_STATUS) {
+				if (id[ATA_ID_COMMAND_SET_1] & COMMANDSET_MEDIA_STATUS) {
 					/* set the removable bit */
 					info->InquiryData.DeviceTypeModifier = DEVICE_REMOVABLE;
 					info->DeviceFlags |= DF_REMOVABLE_MEDIA;
 				}
 
 				/* Fill in vendor identification fields */
-				src = (__be16*)id->model;
+				src = (__be16 *)&id[ATA_ID_PROD];
 				dest = (__u16*)info->InquiryData.VendorId;
 				for (i=0;i<4;i++)
 					dest[i] = be16_to_cpu(src[i]);
 
-				src = (__be16*)(id->model+8);
+				src = (__be16 *)&id[ATA_ID_PROD + 8/2];
 				dest = (__u16*)info->InquiryData.ProductId;
 				for (i=0;i<8;i++)
 					dest[i] = be16_to_cpu(src[i]);
 
-				src = (__be16*)id->fw_rev;
+				src = (__be16 *)&id[ATA_ID_FW_REV];
 				dest = (__u16*)info->InquiryData.ProductRevisionLevel;
 				for (i=0;i<2;i++)
 					dest[i] = be16_to_cpu(src[i]);
 
 				/* determine if it supports Media Status Notification */
-				if (id->command_set_2 & COMMANDSET_MEDIA_STATUS) {
+				if (id[ATA_ID_COMMAND_SET_2] & COMMANDSET_MEDIA_STATUS) {
 					US_DEBUGP("   Device supports Media Status Notification\n");
 
 					/* Indicate that it is enabled, even though it is not
@@ -1301,7 +1222,7 @@ static int isd200_scsi_to_ata(struct scsi_cmnd *srb, struct us_data *us,
 			      union ata_cdb * ataCdb)
 {
 	struct isd200_info *info = (struct isd200_info *)us->extra;
-	struct hd_driveid *id = info->id;
+	u16 *id = info->id;
 	int sendToTransport = 1;
 	unsigned char sectnum, head;
 	unsigned short cylinder;
@@ -1369,13 +1290,12 @@ static int isd200_scsi_to_ata(struct scsi_cmnd *srb, struct us_data *us,
 
 		US_DEBUGP("   ATA OUT - SCSIOP_READ_CAPACITY\n");
 
-		if (id->capability & CAPABILITY_LBA ) {
-			capacity = id->lba_capacity - 1;
-		} else {
-			capacity = (id->heads *
-				    id->cyls *
-				    id->sectors) - 1;
-		}
+		if (ata_id_has_lba(id))
+			capacity = ata_id_u32(id, ATA_ID_LBA_CAPACITY) - 1;
+		else
+			capacity = (id[ATA_ID_HEADS] * id[ATA_ID_CYLS] *
+				    id[ATA_ID_SECTORS]) - 1;
+
 		readCapacityData.LogicalBlockAddress = cpu_to_be32(capacity);
 		readCapacityData.BytesPerBlock = cpu_to_be32(0x200);
 
@@ -1392,16 +1312,16 @@ static int isd200_scsi_to_ata(struct scsi_cmnd *srb, struct us_data *us,
 		lba = be32_to_cpu(*(__be32 *)&srb->cmnd[2]);
 		blockCount = (unsigned long)srb->cmnd[7]<<8 | (unsigned long)srb->cmnd[8];
 
-		if (id->capability & CAPABILITY_LBA) {
+		if (ata_id_has_lba(id)) {
 			sectnum = (unsigned char)(lba);
 			cylinder = (unsigned short)(lba>>8);
 			head = ATA_ADDRESS_DEVHEAD_LBA_MODE | (unsigned char)(lba>>24 & 0x0F);
 		} else {
-			sectnum = (unsigned char)((lba % id->sectors) + 1);
-			cylinder = (unsigned short)(lba / (id->sectors *
-							   id->heads));
-			head = (unsigned char)((lba / id->sectors) %
-					       id->heads);
+			sectnum = (u8)((lba % id[ATA_ID_SECTORS]) + 1);
+			cylinder = (u16)(lba / (id[ATA_ID_SECTORS] *
+					id[ATA_ID_HEADS]));
+			head = (u8)((lba / id[ATA_ID_SECTORS]) %
+					id[ATA_ID_HEADS]);
 		}
 		ataCdb->generic.SignatureByte0 = info->ConfigData.ATAMajorCommand;
 		ataCdb->generic.SignatureByte1 = info->ConfigData.ATAMinorCommand;
@@ -1415,7 +1335,7 @@ static int isd200_scsi_to_ata(struct scsi_cmnd *srb, struct us_data *us,
 		ataCdb->write.CylinderHighByte = (unsigned char)(cylinder>>8);
 		ataCdb->write.CylinderLowByte = (unsigned char)cylinder;
 		ataCdb->write.DeviceHeadByte = (head | ATA_ADDRESS_DEVHEAD_STD);
-		ataCdb->write.CommandByte = WIN_READ;
+		ataCdb->write.CommandByte = ATA_CMD_PIO_READ;
 		break;
 
 	case WRITE_10:
@@ -1424,14 +1344,16 @@ static int isd200_scsi_to_ata(struct scsi_cmnd *srb, struct us_data *us,
 		lba = be32_to_cpu(*(__be32 *)&srb->cmnd[2]);
 		blockCount = (unsigned long)srb->cmnd[7]<<8 | (unsigned long)srb->cmnd[8];
 
-		if (id->capability & CAPABILITY_LBA) {
+		if (ata_id_has_lba(id)) {
 			sectnum = (unsigned char)(lba);
 			cylinder = (unsigned short)(lba>>8);
 			head = ATA_ADDRESS_DEVHEAD_LBA_MODE | (unsigned char)(lba>>24 & 0x0F);
 		} else {
-			sectnum = (unsigned char)((lba % id->sectors) + 1);
-			cylinder = (unsigned short)(lba / (id->sectors * id->heads));
-			head = (unsigned char)((lba / id->sectors) % id->heads);
+			sectnum = (u8)((lba % id[ATA_ID_SECTORS]) + 1);
+			cylinder = (u16)(lba / (id[ATA_ID_SECTORS] *
+					id[ATA_ID_HEADS]));
+			head = (u8)((lba / id[ATA_ID_SECTORS]) %
+					id[ATA_ID_HEADS]);
 		}
 		ataCdb->generic.SignatureByte0 = info->ConfigData.ATAMajorCommand;
 		ataCdb->generic.SignatureByte1 = info->ConfigData.ATAMinorCommand;
@@ -1445,7 +1367,7 @@ static int isd200_scsi_to_ata(struct scsi_cmnd *srb, struct us_data *us,
 		ataCdb->write.CylinderHighByte = (unsigned char)(cylinder>>8);
 		ataCdb->write.CylinderLowByte = (unsigned char)cylinder;
 		ataCdb->write.DeviceHeadByte = (head | ATA_ADDRESS_DEVHEAD_STD);
-		ataCdb->write.CommandByte = WIN_WRITE;
+		ataCdb->write.CommandByte = ATA_CMD_PIO_WRITE;
 		break;
 
 	case ALLOW_MEDIUM_REMOVAL:
@@ -1459,7 +1381,7 @@ static int isd200_scsi_to_ata(struct scsi_cmnd *srb, struct us_data *us,
 			ataCdb->generic.TransferBlockSize = 1;
 			ataCdb->generic.RegisterSelect = REG_COMMAND;
 			ataCdb->write.CommandByte = (srb->cmnd[4] & 0x1) ?
-				WIN_DOORLOCK : WIN_DOORUNLOCK;
+				ATA_CMD_MEDIA_LOCK : ATA_CMD_MEDIA_UNLOCK;
 			isd200_srb_set_bufflen(srb, 0);
 		} else {
 			US_DEBUGP("   Not removeable media, just report okay\n");
@@ -1539,8 +1461,7 @@ static int isd200_init_info(struct us_data *us)
 	if (!info)
 		retStatus = ISD200_ERROR;
 	else {
-		info->id = (struct hd_driveid *)
-				kzalloc(sizeof(struct hd_driveid), GFP_KERNEL);
+		info->id = kzalloc(ATA_ID_WORDS * 2, GFP_KERNEL);
 		info->RegsBuf = (unsigned char *)
 				kmalloc(sizeof(info->ATARegs), GFP_KERNEL);
 		info->srb.sense_buffer =