diff options
Diffstat (limited to 'drivers')
96 files changed, 6646 insertions, 2308 deletions
diff --git a/drivers/base/iommu.c b/drivers/base/iommu.c index c2d1eed9037..9f0e672f4be 100644 --- a/drivers/base/iommu.c +++ b/drivers/base/iommu.c @@ -98,3 +98,10 @@ phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, return iommu_ops->iova_to_phys(domain, iova); } EXPORT_SYMBOL_GPL(iommu_iova_to_phys); + +int iommu_domain_has_cap(struct iommu_domain *domain, + unsigned long cap) +{ + return iommu_ops->domain_has_cap(domain, cap); +} +EXPORT_SYMBOL_GPL(iommu_domain_has_cap); diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 45c5a33daf4..31693bc2444 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -4,6 +4,7 @@ * Filesystem request handling methods */ +#include <linux/ata.h> #include <linux/hdreg.h> #include <linux/blkdev.h> #include <linux/skbuff.h> @@ -267,7 +268,7 @@ aoecmd_ata_rw(struct aoedev *d) writebit = 0; } - ah->cmdstat = WIN_READ | writebit | extbit; + ah->cmdstat = ATA_CMD_PIO_READ | writebit | extbit; /* mark all tracking fields and load out */ buf->nframesout += 1; @@ -362,10 +363,10 @@ resend(struct aoedev *d, struct aoetgt *t, struct frame *f) switch (ah->cmdstat) { default: break; - case WIN_READ: - case WIN_READ_EXT: - case WIN_WRITE: - case WIN_WRITE_EXT: + case ATA_CMD_PIO_READ: + case ATA_CMD_PIO_READ_EXT: + case ATA_CMD_PIO_WRITE: + case ATA_CMD_PIO_WRITE_EXT: put_lba(ah, f->lba); n = f->bcnt; @@ -812,8 +813,8 @@ aoecmd_ata_rsp(struct sk_buff *skb) d->htgt = NULL; n = ahout->scnt << 9; switch (ahout->cmdstat) { - case WIN_READ: - case WIN_READ_EXT: + case ATA_CMD_PIO_READ: + case ATA_CMD_PIO_READ_EXT: if (skb->len - sizeof *hin - sizeof *ahin < n) { printk(KERN_ERR "aoe: %s. skb->len=%d need=%ld\n", @@ -823,8 +824,8 @@ aoecmd_ata_rsp(struct sk_buff *skb) return; } memcpy(f->bufaddr, ahin+1, n); - case WIN_WRITE: - case WIN_WRITE_EXT: + case ATA_CMD_PIO_WRITE: + case ATA_CMD_PIO_WRITE_EXT: ifp = getif(t, skb->dev); if (ifp) { ifp->lost = 0; @@ -838,7 +839,7 @@ aoecmd_ata_rsp(struct sk_buff *skb) goto xmit; } break; - case WIN_IDENTIFY: + case ATA_CMD_ID_ATA: if (skb->len - sizeof *hin - sizeof *ahin < 512) { printk(KERN_INFO "aoe: runt data size in ataid. skb->len=%d\n", @@ -914,7 +915,7 @@ aoecmd_ata_id(struct aoedev *d) /* set up ata header */ ah->scnt = 1; - ah->cmdstat = WIN_IDENTIFY; + ah->cmdstat = ATA_CMD_ID_ATA; ah->lba3 = 0xa0; skb->dev = t->ifp->nd; diff --git a/drivers/block/hd.c b/drivers/block/hd.c index 482c0c4b964..3c11f062a18 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -42,6 +42,8 @@ #include <linux/ata.h> #include <linux/hdreg.h> +#define HD_IRQ 14 + #define REALLY_SLOW_IO #include <asm/system.h> #include <asm/io.h> diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index 119be3442f2..6cccdc3f522 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -89,6 +89,7 @@ #include <linux/delay.h> #include <linux/slab.h> #include <linux/blkdev.h> +#include <linux/ata.h> #include <linux/hdreg.h> #include <linux/platform_device.h> #if defined(CONFIG_OF) @@ -208,7 +209,7 @@ struct ace_device { struct gendisk *gd; /* Inserted CF card parameters */ - struct hd_driveid cf_id; + u16 cf_id[ATA_ID_WORDS]; }; static int ace_major; @@ -402,21 +403,14 @@ static void ace_dump_regs(struct ace_device *ace) ace_in32(ace, ACE_CFGLBA), ace_in(ace, ACE_FATSTAT)); } -void ace_fix_driveid(struct hd_driveid *id) +void ace_fix_driveid(u16 *id) { #if defined(__BIG_ENDIAN) - u16 *buf = (void *)id; int i; /* All half words have wrong byte order; swap the bytes */ - for (i = 0; i < sizeof(struct hd_driveid); i += 2, buf++) - *buf = le16_to_cpu(*buf); - - /* Some of the data values are 32bit; swap the half words */ - id->lba_capacity = ((id->lba_capacity >> 16) & 0x0000FFFF) | - ((id->lba_capacity << 16) & 0xFFFF0000); - id->spg = ((id->spg >> 16) & 0x0000FFFF) | - ((id->spg << 16) & 0xFFFF0000); + for (i = 0; i < ATA_ID_WORDS; i++, id++) + *id = le16_to_cpu(*id); #endif } @@ -614,7 +608,7 @@ static void ace_fsm_dostate(struct ace_device *ace) break; case ACE_FSM_STATE_IDENTIFY_COMPLETE: - ace_fix_driveid(&ace->cf_id); + ace_fix_driveid(&ace->cf_id[0]); ace_dump_mem(&ace->cf_id, 512); /* Debug: Dump out disk ID */ if (ace->data_result) { @@ -627,9 +621,10 @@ static void ace_fsm_dostate(struct ace_device *ace) ace->media_change = 0; /* Record disk parameters */ - set_capacity(ace->gd, ace->cf_id.lba_capacity); + set_capacity(ace->gd, + ata_id_u32(&ace->cf_id, ATA_ID_LBA_CAPACITY)); dev_info(ace->dev, "capacity: %i sectors\n", - ace->cf_id.lba_capacity); + ata_id_u32(&ace->cf_id, ATA_ID_LBA_CAPACITY)); } /* We're done, drop to IDLE state and notify waiters */ @@ -928,12 +923,13 @@ static int ace_release(struct gendisk *disk, fmode_t mode) static int ace_getgeo(struct block_device *bdev, struct hd_geometry *geo) { struct ace_device *ace = bdev->bd_disk->private_data; + u16 *cf_id = &ace->cf_id[0]; dev_dbg(ace->dev, "ace_getgeo()\n"); - geo->heads = ace->cf_id.heads; - geo->sectors = ace->cf_id.sectors; - geo->cylinders = ace->cf_id.cyls; + geo->heads = cf_id[ATA_ID_HEADS]; + geo->sectors = cf_id[ATA_ID_SECTORS]; + geo->cylinders = cf_id[ATA_ID_CYLS]; return 0; } diff --git a/drivers/char/hw_random/timeriomem-rng.c b/drivers/char/hw_random/timeriomem-rng.c index 10ad41be589..dcd352ad0e7 100644 --- a/drivers/char/hw_random/timeriomem-rng.c +++ b/drivers/char/hw_random/timeriomem-rng.c @@ -90,10 +90,30 @@ static struct hwrng timeriomem_rng_ops = { static int __init timeriomem_rng_probe(struct platform_device *pdev) { + struct resource *res, *mem; int ret; + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + + if (!res) + return -ENOENT; + + mem = request_mem_region(res->start, res->end - res->start + 1, + pdev->name); + if (mem == NULL) + return -EBUSY; + + dev_set_drvdata(&pdev->dev, mem); + timeriomem_rng_data = pdev->dev.platform_data; + timeriomem_rng_data->address = ioremap(res->start, + res->end - res->start + 1); + if (!timeriomem_rng_data->address) { + ret = -ENOMEM; + goto err_ioremap; + } + if (timeriomem_rng_data->period != 0 && usecs_to_jiffies(timeriomem_rng_data->period) > 0) { timeriomem_rng_timer.expires = jiffies; @@ -104,23 +124,34 @@ static int __init timeriomem_rng_probe(struct platform_device *pdev) timeriomem_rng_data->present = 1; ret = hwrng_register(&timeriomem_rng_ops); - if (ret) { - dev_err(&pdev->dev, "problem registering\n"); - return ret; - } + if (ret) + goto err_register; dev_info(&pdev->dev, "32bits from 0x%p @ %dus\n", timeriomem_rng_data->address, timeriomem_rng_data->period); return 0; + +err_register: + dev_err(&pdev->dev, "problem registering\n"); + iounmap(timeriomem_rng_data->address); +err_ioremap: + release_resource(mem); + + return ret; } static int __devexit timeriomem_rng_remove(struct platform_device *pdev) { + struct resource *mem = dev_get_drvdata(&pdev->dev); + del_timer_sync(&timeriomem_rng_timer); hwrng_unregister(&timeriomem_rng_ops); + iounmap(timeriomem_rng_data->address); + release_resource(mem); + return 0; } diff --git a/drivers/crypto/ixp4xx_crypto.c b/drivers/crypto/ixp4xx_crypto.c index d9e751be8c5..af9761ccf9f 100644 --- a/drivers/crypto/ixp4xx_crypto.c +++ b/drivers/crypto/ixp4xx_crypto.c @@ -101,6 +101,7 @@ struct buffer_desc { u32 phys_addr; u32 __reserved[4]; struct buffer_desc *next; + enum dma_data_direction dir; }; struct crypt_ctl { @@ -132,14 +133,10 @@ struct crypt_ctl { struct ablk_ctx { struct buffer_desc *src; struct buffer_desc *dst; - unsigned src_nents; - unsigned dst_nents; }; struct aead_ctx { struct buffer_desc *buffer; - unsigned short assoc_nents; - unsigned short src_nents; struct scatterlist ivlist; /* used when the hmac is not on one sg entry */ u8 *hmac_virt; @@ -312,7 +309,7 @@ static struct crypt_ctl *get_crypt_desc_emerg(void) } } -static void free_buf_chain(struct buffer_desc *buf, u32 phys) +static void free_buf_chain(struct device *dev, struct buffer_desc *buf,u32 phys) { while (buf) { struct buffer_desc *buf1; @@ -320,6 +317,7 @@ static void free_buf_chain(struct buffer_desc *buf, u32 phys) buf1 = buf->next; phys1 = buf->phys_next; + dma_unmap_single(dev, buf->phys_next, buf->buf_len, buf->dir); dma_pool_free(buffer_pool, buf, phys); buf = buf1; phys = phys1; @@ -348,7 +346,6 @@ static void one_packet(dma_addr_t phys) struct crypt_ctl *crypt; struct ixp_ctx *ctx; int failed; - enum dma_data_direction src_direction = DMA_BIDIRECTIONAL; failed = phys & 0x1 ? -EBADMSG : 0; phys &= ~0x3; @@ -358,13 +355,8 @@ static void one_packet(dma_addr_t phys) case CTL_FLAG_PERFORM_AEAD: { struct aead_request *req = crypt->data.aead_req; struct aead_ctx *req_ctx = aead_request_ctx(req); - dma_unmap_sg(dev, req->assoc, req_ctx->assoc_nents, - DMA_TO_DEVICE); - dma_unmap_sg(dev, &req_ctx->ivlist, 1, DMA_BIDIRECTIONAL); - dma_unmap_sg(dev, req->src, req_ctx->src_nents, - DMA_BIDIRECTIONAL); - free_buf_chain(req_ctx->buffer, crypt->src_buf); + free_buf_chain(dev, req_ctx->buffer, crypt->src_buf); if (req_ctx->hmac_virt) { finish_scattered_hmac(crypt); } @@ -374,16 +366,11 @@ static void one_packet(dma_addr_t phys) case CTL_FLAG_PERFORM_ABLK: { struct ablkcipher_request *req = crypt->data.ablk_req; struct ablk_ctx *req_ctx = ablkcipher_request_ctx(req); - int nents; + if (req_ctx->dst) { - nents = req_ctx->dst_nents; - dma_unmap_sg(dev, req->dst, nents, DMA_FROM_DEVICE); - free_buf_chain(req_ctx->dst, crypt->dst_buf); - src_direction = DMA_TO_DEVICE; + free_buf_chain(dev, req_ctx->dst, crypt->dst_buf); } - nents = req_ctx->src_nents; - dma_unmap_sg(dev, req->src, nents, src_direction); - free_buf_chain(req_ctx->src, crypt->src_buf); + free_buf_chain(dev, req_ctx->src, crypt->src_buf); req->base.complete(&req->base, failed); break; } @@ -750,56 +737,35 @@ static int setup_cipher(struct crypto_tfm *tfm, int encrypt, return 0; } -static int count_sg(struct scatterlist *sg, int nbytes) +static struct buffer_desc *chainup_buffers(struct device *dev, + struct scatterlist *sg, unsigned nbytes, + struct buffer_desc *buf, gfp_t flags, + enum dma_data_direction dir) { - int i; - for (i = 0; nbytes > 0; i++, sg = sg_next(sg)) - nbytes -= sg->length; - return i; -} - -static struct buffer_desc *chainup_buffers(struct scatterlist *sg, - unsigned nbytes, struct buffer_desc *buf, gfp_t flags) -{ - int nents = 0; - - while (nbytes > 0) { + for (;nbytes > 0; sg = scatterwalk_sg_next(sg)) { + unsigned len = min(nbytes, sg->length); struct buffer_desc *next_buf; u32 next_buf_phys; - unsigned len = min(nbytes, sg_dma_len(sg)); + void *ptr; - nents++; nbytes -= len; - if (!buf->phys_addr) { - buf->phys_addr = sg_dma_address(sg); - buf->buf_len = len; - buf->next = NULL; - buf->phys_next = 0; - goto next; - } - /* Two consecutive chunks on one page may be handled by the old - * buffer descriptor, increased by the length of the new one - */ - if (sg_dma_address(sg) == buf->phys_addr + buf->buf_len) { - buf->buf_len += len; - goto next; - } + ptr = page_address(sg_page(sg)) + sg->offset; next_buf = dma_pool_alloc(buffer_pool, flags, &next_buf_phys); - if (!next_buf) - return NULL; + if (!next_buf) { + buf = NULL; + break; + } + sg_dma_address(sg) = dma_map_single(dev, ptr, len, dir); buf->next = next_buf; buf->phys_next = next_buf_phys; - buf = next_buf; - buf->next = NULL; - buf->phys_next = 0; + buf->phys_addr = sg_dma_address(sg); buf->buf_len = len; -next: - if (nbytes > 0) { - sg = sg_next(sg); - } + buf->dir = dir; } + buf->next = NULL; + buf->phys_next = 0; return buf; } @@ -860,12 +826,12 @@ static int ablk_perform(struct ablkcipher_request *req, int encrypt) struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); struct ixp_ctx *ctx = crypto_ablkcipher_ctx(tfm); unsigned ivsize = crypto_ablkcipher_ivsize(tfm); - int ret = -ENOMEM; struct ix_sa_dir *dir; struct crypt_ctl *crypt; - unsigned int nbytes = req->nbytes, nents; + unsigned int nbytes = req->nbytes; enum dma_data_direction src_direction = DMA_BIDIRECTIONAL; struct ablk_ctx *req_ctx = ablkcipher_request_ctx(req); + struct buffer_desc src_hook; gfp_t flags = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL : GFP_ATOMIC; @@ -878,7 +844,7 @@ static int ablk_perform(struct ablkcipher_request *req, int encrypt) crypt = get_crypt_desc(); if (!crypt) - return ret; + return -ENOMEM; crypt->data.ablk_req = req; crypt->crypto_ctx = dir->npe_ctx_phys; @@ -891,53 +857,41 @@ static int ablk_perform(struct ablkcipher_request *req, int encrypt) BUG_ON(ivsize && !req->info); memcpy(crypt->iv, req->info, ivsize); if (req->src != req->dst) { + struct buffer_desc dst_hook; crypt->mode |= NPE_OP_NOT_IN_PLACE; - nents = count_sg(req->dst, nbytes); /* This was never tested by Intel * for more than one dst buffer, I think. */ - BUG_ON(nents != 1); - req_ctx->dst_nents = nents; - dma_map_sg(dev, req->dst, nents, DMA_FROM_DEVICE); - req_ctx->dst = dma_pool_alloc(buffer_pool, flags,&crypt->dst_buf); - if (!req_ctx->dst) - goto unmap_sg_dest; - req_ctx->dst->phys_addr = 0; - if (!chainup_buffers(req->dst, nbytes, req_ctx->dst, flags)) + BUG_ON(req->dst->length < nbytes); + req_ctx->dst = NULL; + if (!chainup_buffers(dev, req->dst, nbytes, &dst_hook, + flags, DMA_FROM_DEVICE)) goto free_buf_dest; src_direction = DMA_TO_DEVICE; + req_ctx->dst = dst_hook.next; + crypt->dst_buf = dst_hook.phys_next; } else { req_ctx->dst = NULL; - req_ctx->dst_nents = 0; } - nents = count_sg(req->src, nbytes); - req_ctx->src_nents = nents; - dma_map_sg(dev, req->src, nents, src_direction); - - req_ctx->src = dma_pool_alloc(buffer_pool, flags, &crypt->src_buf); - if (!req_ctx->src) - goto unmap_sg_src; - req_ctx->src->phys_addr = 0; - if (!chainup_buffers(req->src, nbytes, req_ctx->src, flags)) + req_ctx->src = NULL; + if (!chainup_buffers(dev, req->src, nbytes, &src_hook, + flags, src_direction)) goto free_buf_src; + req_ctx->src = src_hook.next; + crypt->src_buf = src_hook.phys_next; crypt->ctl_flags |= CTL_FLAG_PERFORM_ABLK; qmgr_put_entry(SEND_QID, crypt_virt2phys(crypt)); BUG_ON(qmgr_stat_overflow(SEND_QID)); return -EINPROGRESS; free_buf_src: - free_buf_chain(req_ctx->src, crypt->src_buf); -unmap_sg_src: - dma_unmap_sg(dev, req->src, req_ctx->src_nents, src_direction); + free_buf_chain(dev, req_ctx->src, crypt->src_buf); free_buf_dest: if (req->src != req->dst) { - free_buf_chain(req_ctx->dst, crypt->dst_buf); -unmap_sg_dest: - dma_unmap_sg(dev, req->src, req_ctx->dst_nents, - DMA_FROM_DEVICE); + free_buf_chain(dev, req_ctx->dst, crypt->dst_buf); } crypt->ctl_flags = CTL_FLAG_UNUSED; - return ret; + return -ENOMEM; } static int ablk_encrypt(struct ablkcipher_request *req) @@ -985,7 +939,7 @@ static int hmac_inconsistent(struct scatterlist *sg, unsigned start, break; offset += sg->length; - sg = sg_next(sg); + sg = scatterwalk_sg_next(sg); } return (start + nbytes > offset + sg->length); } @@ -997,11 +951,10 @@ static int aead_perform(struct aead_request *req, int encrypt, struct ixp_ctx *ctx = crypto_aead_ctx(tfm); unsigned ivsize = crypto_aead_ivsize(tfm); unsigned authsize = crypto_aead_authsize(tfm); - int ret = -ENOMEM; struct ix_sa_dir *dir; struct crypt_ctl *crypt; - unsigned int cryptlen, nents; - struct buffer_desc *buf; + unsigned int cryptlen; + struct buffer_desc *buf, src_hook; struct aead_ctx *req_ctx = aead_request_ctx(req); gfp_t flags = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL : GFP_ATOMIC; @@ -1022,7 +975,7 @@ static int aead_perform(struct aead_request *req, int encrypt, } crypt = get_crypt_desc(); if (!crypt) - return ret; + return -ENOMEM; crypt->data.aead_req = req; crypt->crypto_ctx = dir->npe_ctx_phys; @@ -1041,31 +994,27 @@ static int aead_perform(struct aead_request *req, int encrypt, BUG(); /* -ENOTSUP because of my lazyness */ } - req_ctx->buffer = dma_pool_alloc(buffer_pool, flags, &crypt->src_buf); - if (!req_ctx->buffer) - goto out; - req_ctx->buffer->phys_addr = 0; /* ASSOC data */ - nents = count_sg(req->assoc, req->assoclen); - req_ctx->assoc_nents = nents; - dma_map_sg(dev, req->assoc, nents, DMA_TO_DEVICE); - buf = chainup_buffers(req->assoc, req->assoclen, req_ctx->buffer,flags); + buf = chainup_buffers(dev, req->assoc, req->assoclen, &src_hook, + flags, DMA_TO_DEVICE); + req_ctx->buffer = src_hook.next; + crypt->src_buf = src_hook.phys_next; if (!buf) - goto unmap_sg_assoc; + goto out; /* IV */ sg_init_table(&req_ctx->ivlist, 1); sg_set_buf(&req_ctx->ivlist, iv, ivsize); - dma_map_sg(dev, &req_ctx->ivlist, 1, DMA_BIDIRECTIONAL); - buf = chainup_buffers(&req_ctx->ivlist, ivsize, buf, flags); + buf = chainup_buffers(dev, &req_ctx->ivlist, ivsize, buf, flags, + DMA_BIDIRECTIONAL); if (!buf) - goto unmap_sg_iv; + goto free_chain; if (unlikely(hmac_inconsistent(req->src, cryptlen, authsize))) { /* The 12 hmac bytes are scattered, * we need to copy them into a safe buffer */ req_ctx->hmac_virt = dma_pool_alloc(buffer_pool, flags, &crypt->icv_rev_aes); if (unlikely(!req_ctx->hmac_virt)) - goto unmap_sg_iv; + goto free_chain; if (!encrypt) { scatterwalk_map_and_copy(req_ctx->hmac_virt, req->src, cryptlen, authsize, 0); @@ -1075,33 +1024,28 @@ static int aead_perform(struct aead_request *req, int encrypt, req_ctx->hmac_virt = NULL; } /* Crypt */ - nents = count_sg(req->src, cryptlen + authsize); - req_ctx->src_nents = nents; - dma_map_sg(dev, req->src, nents, DMA_BIDIRECTIONAL); - buf = chainup_buffers(req->src, cryptlen + authsize, buf, flags); + buf = chainup_buffers(dev, req->src, cryptlen + authsize, buf, flags, + DMA_BIDIRECTIONAL); if (!buf) - goto unmap_sg_src; + goto free_hmac_virt; if (!req_ctx->hmac_virt) { crypt->icv_rev_aes = buf->phys_addr + buf->buf_len - authsize; } + crypt->ctl_flags |= CTL_FLAG_PERFORM_AEAD; qmgr_put_entry(SEND_QID, crypt_virt2phys(crypt)); BUG_ON(qmgr_stat_overflow(SEND_QID)); return -EINPROGRESS; -unmap_sg_src: - dma_unmap_sg(dev, req->src, req_ctx->src_nents, DMA_BIDIRECTIONAL); +free_hmac_virt: if (req_ctx->hmac_virt) { dma_pool_free(buffer_pool, req_ctx->hmac_virt, crypt->icv_rev_aes); } -unmap_sg_iv: - dma_unmap_sg(dev, &req_ctx->ivlist, 1, DMA_BIDIRECTIONAL); -unmap_sg_assoc: - dma_unmap_sg(dev, req->assoc, req_ctx->assoc_nents, DMA_TO_DEVICE); - free_buf_chain(req_ctx->buffer, crypt->src_buf); +free_chain: + free_buf_chain(dev, req_ctx->buffer, crypt->src_buf); out: crypt->ctl_flags = CTL_FLAG_UNUSED; - return ret; + return -ENOMEM; } static int aead_setup(struct crypto_aead *tfm, unsigned int authsize) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 48ea59e7967..3b3c01b6f1e 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -98,6 +98,17 @@ config NET_DMA Say Y here if you enabled INTEL_IOATDMA or FSL_DMA, otherwise say N. +config ASYNC_TX_DMA + bool "Async_tx: Offload support for the async_tx api" + depends on DMA_ENGINE + help + This allows the async_tx api to take advantage of offload engines for + memcpy, memset, xor, and raid6 p+q operations. If your platform has + a dma engine that can perform raid operations and you have enabled + MD_RAID456 say Y. + + If unsure, say N. + config DMATEST tristate "DMA Test client" depends on DMA_ENGINE diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 280a9d263eb..92438e9dacc 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -507,6 +507,7 @@ struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, v * published in the general-purpose allocator */ dma_cap_set(DMA_PRIVATE, device->cap_mask); + device->privatecnt++; err = dma_chan_get(chan); if (err == -ENODEV) { @@ -518,6 +519,8 @@ struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, v dma_chan_name(chan), err); else break; + if (--device->privatecnt == 0) + dma_cap_clear(DMA_PRIVATE, device->cap_mask); chan->private = NULL; chan = NULL; } @@ -537,6 +540,9 @@ void dma_release_channel(struct dma_chan *chan) WARN_ONCE(chan->client_count != 1, "chan reference count %d != 1\n", chan->client_count); dma_chan_put(chan); + /* drop PRIVATE cap enabled by __dma_request_channel() */ + if (--chan->device->privatecnt == 0) + dma_cap_clear(DMA_PRIVATE, chan->device->cap_mask); chan->private = NULL; mutex_unlock(&dma_list_mutex); } @@ -602,6 +608,24 @@ void dmaengine_put(void) } EXPORT_SYMBOL(dmaengine_put); +static int get_dma_id(struct dma_device *device) +{ + int rc; + + idr_retry: + if (!idr_pre_get(&dma_idr, GFP_KERNEL)) + return -ENOMEM; + mutex_lock(&dma_list_mutex); + rc = idr_get_new(&dma_idr, NULL, &device->dev_id); + mutex_unlock(&dma_list_mutex); + if (rc == -EAGAIN) + goto idr_retry; + else if (rc != 0) + return rc; + + return 0; +} + /** * dma_async_device_register - registers DMA devices found * @device: &dma_device @@ -640,27 +664,25 @@ int dma_async_device_register(struct dma_device *device) idr_ref = kmalloc(sizeof(*idr_ref), GFP_KERNEL); if (!idr_ref) return -ENOMEM; - atomic_set(idr_ref, 0); - idr_retry: - if (!idr_pre_get(&dma_idr, GFP_KERNEL)) - return -ENOMEM; - mutex_lock(&dma_list_mutex); - rc = idr_get_new(&dma_idr, NULL, &device->dev_id); - mutex_unlock(&dma_list_mutex); - if (rc == -EAGAIN) - goto idr_retry; - else if (rc != 0) + rc = get_dma_id(device); + if (rc != 0) { + kfree(idr_ref); return rc; + } + + atomic_set(idr_ref, 0); /* represent channels in sysfs. Probably want devs too */ list_for_each_entry(chan, &device->channels, device_node) { + rc = -ENOMEM; chan->local = alloc_percpu(typeof(*chan->local)); if (chan->local == NULL) - continue; + goto err_out; chan->dev = kzalloc(sizeof(*chan->dev), GFP_KERNEL); if (chan->dev == NULL) { free_percpu(chan->local); - continue; + chan->local = NULL; + goto err_out; } chan->chan_id = chancnt++; @@ -677,6 +699,8 @@ int dma_async_device_register(struct dma_device *device) if (rc) { free_percpu(chan->local); chan->local = NULL; + kfree(chan->dev); + atomic_dec(idr_ref); goto err_out; } chan->client_count = 0; @@ -701,12 +725,23 @@ int dma_async_device_register(struct dma_device *device) } } list_add_tail_rcu(&device->global_node, &dma_device_list); + if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) + device->privatecnt++; /* Always private */ dma_channel_rebalance(); mutex_unlock(&dma_list_mutex); return 0; err_out: + /* if we never registered a channel just release the idr */ + if (atomic_read(idr_ref) == 0) { + mutex_lock(&dma_list_mutex); + idr_remove(&dma_idr, device->dev_id); + mutex_unlock(&dma_list_mutex); + kfree(idr_ref); + return rc; + } + list_for_each_entry(chan, &device->channels, device_node) { if (chan->local == NULL) continue; @@ -893,6 +928,7 @@ void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, { tx->chan = chan; spin_lock_init(&tx->lock); + INIT_LIST_HEAD(&tx->tx_list); } EXPORT_SYMBOL(dma_async_tx_descriptor_init); diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index e190d8b3070..a27c0fb1bc1 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -38,6 +38,11 @@ module_param(max_channels, uint, S_IRUGO); MODULE_PARM_DESC(max_channels, "Maximum number of channels to use (default: all)"); +static unsigned int xor_sources = 3; +module_param(xor_sources, uint, S_IRUGO); +MODULE_PARM_DESC(xor_sources, + "Number of xor source buffers (default: 3)"); + /* * Initialization patterns. All bytes in the source buffer has bit 7 * set, all bytes in the destination buffer has bit 7 cleared. @@ -59,8 +64,9 @@ struct dmatest_thread { struct list_head node; struct task_struct *task; struct dma_chan *chan; - u8 *srcbuf; - u8 *dstbuf; + u8 **srcs; + u8 **dsts; + enum dma_transaction_type type; }; struct dmatest_chan { @@ -98,30 +104,37 @@ static unsigned long dmatest_random(void) return buf; } -static void dmatest_init_srcbuf(u8 *buf, unsigned int start, unsigned int len) +static void dmatest_init_srcs(u8 **bufs, unsigned int start, unsigned int len) { unsigned int i; - - for (i = 0; i < start; i++) - buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK); - for ( ; i < start + len; i++) - buf[i] = PATTERN_SRC | PATTERN_COPY - | (~i & PATTERN_COUNT_MASK);; - for ( ; i < test_buf_size; i++) - buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK); + u8 *buf; + + for (; (buf = *bufs); bufs++) { + for (i = 0; i < start; i++) + buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK); + for ( ; i < start + len; i++) + buf[i] = PATTERN_SRC | PATTERN_COPY + | (~i & PATTERN_COUNT_MASK);; + for ( ; i < test_buf_size; i++) + buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK); + buf++; + } } -static void dmatest_init_dstbuf(u8 *buf, unsigned int start, unsigned int len) +static void dmatest_init_dsts(u8 **bufs, unsigned int start, unsigned int len) { unsigned int i; - - for (i = 0; i < start; i++) - buf[i] = PATTERN_DST | (~i & PATTERN_COUNT_MASK); - for ( ; i < start + len; i++) - buf[i] = PATTERN_DST | PATTERN_OVERWRITE - | (~i & PATTERN_COUNT_MASK); - for ( ; i < test_buf_size; i++) - buf[i] = PATTERN_DST | (~i & PATTERN_COUNT_MASK); + u8 *buf; + + for (; (buf = *bufs); bufs++) { + for (i = 0; i < start; i++) + buf[i] = PATTERN_DST | (~i & PATTERN_COUNT_MASK); + for ( ; i < start + len; i++) + buf[i] = PATTERN_DST | PATTERN_OVERWRITE + | (~i & PATTERN_COUNT_MASK); + for ( ; i < test_buf_size; i++) + buf[i] = PATTERN_DST | (~i & PATTERN_COUNT_MASK); + } } static void dmatest_mismatch(u8 actual, u8 pattern, unsigned int index, @@ -150,23 +163,30 @@ static void dmatest_mismatch(u8 actual, u8 pattern, unsigned int index, thread_name, index, expected, actual); } -static unsigned int dmatest_verify(u8 *buf, unsigned int start, +static unsigned int dmatest_verify(u8 **bufs, unsigned int start, unsigned int end, unsigned int counter, u8 pattern, bool is_srcbuf) { unsigned int i; unsigned int error_count = 0; u8 actual; - - for (i = start; i < end; i++) { - actual = buf[i]; - if (actual != (pattern | (~counter & PATTERN_COUNT_MASK))) { - if (error_count < 32) - dmatest_mismatch(actual, pattern, i, counter, - is_srcbuf); - error_count++; + u8 expected; + u8 *buf; + unsigned int counter_orig = counter; + + for (; (buf = *bufs); bufs++) { + counter = counter_orig; + for (i = start; i < end; i++) { + actual = buf[i]; + expected = pattern | (~counter & PATTERN_COUNT_MASK); + if (actual != expected) { + if (error_count < 32) + dmatest_mismatch(actual, pattern, i, + counter, is_srcbuf); + error_count++; + } + counter++; } - counter++; } if (error_count > 32) @@ -176,12 +196,17 @@ static unsigned int dmatest_verify(u8 *buf, unsigned int start, return error_count; } +static void dmatest_callback(void *completion) +{ + complete(completion); +} + /* * This function repeatedly tests DMA transfers of various lengths and - * offsets until it is told to exit by kthread_stop(). There may be - * multiple threads running this function in parallel for a single - * channel, and there may be multiple channels being tested in - * parallel. + * offsets for a given operation type until it is told to exit by + * kthread_stop(). There may be multiple threads running this function + * in parallel for a single channel, and there may be multiple channels + * being tested in parallel. * * Before each test, the source and destination buffer is initialized * with a known pattern. This pattern is different depending on @@ -201,25 +226,57 @@ static int dmatest_func(void *data) unsigned int total_tests = 0; dma_cookie_t cookie; enum dma_status status; + enum dma_ctrl_flags flags; int ret; + int src_cnt; + int dst_cnt; + int i; thread_name = current->comm; ret = -ENOMEM; - thread->srcbuf = kmalloc(test_buf_size, GFP_KERNEL); - if (!thread->srcbuf) - goto err_srcbuf; - thread->dstbuf = kmalloc(test_buf_size, GFP_KERNEL); - if (!thread->dstbuf) - goto err_dstbuf; smp_rmb(); chan = thread->chan; + if (thread->type == DMA_MEMCPY) + src_cnt = dst_cnt = 1; + else if (thread->type == DMA_XOR) { + src_cnt = xor_sources | 1; /* force odd to ensure dst = src */ + dst_cnt = 1; + } else + goto err_srcs; + + thread->srcs = kcalloc(src_cnt+1, sizeof(u8 *), GFP_KERNEL); + if (!thread->srcs) + goto err_srcs; + for (i = 0; i < src_cnt; i++) { + thread->srcs[i] = kmalloc(test_buf_size, GFP_KERNEL); + if (!thread->srcs[i]) + goto err_srcbuf; + } + thread->srcs[i] = NULL; + + thread->dsts = kcalloc(dst_cnt+1, sizeof(u8 *), GFP_KERNEL); + if (!thread->dsts) + goto err_dsts; + for (i = 0; i < dst_cnt; i++) { + thread->dsts[i] = kmalloc(test_buf_size, GFP_KERNEL); + if (!thread->dsts[i]) + goto err_dstbuf; + } + thread->dsts[i] = NULL; + + set_user_nice(current, 10); + + flags = DMA_CTRL_ACK | DMA_COMPL_SKIP_DEST_UNMAP | DMA_PREP_INTERRUPT; while (!kthread_should_stop()) { struct dma_device *dev = chan->device; - struct dma_async_tx_descriptor *tx; - dma_addr_t dma_src, dma_dest; + struct dma_async_tx_descriptor *tx = NULL; + dma_addr_t dma_srcs[src_cnt]; + dma_addr_t dma_dsts[dst_cnt]; + struct completion cmp; + unsigned long tmo = msecs_to_jiffies(3000); total_tests++; @@ -227,22 +284,41 @@ static int dmatest_func(void *data) src_off = dmatest_random() % (test_buf_size - len + 1); dst_off = dmatest_random() % (test_buf_size - len + 1); - dmatest_init_srcbuf(thread->srcbuf, src_off, len); - dmatest_init_dstbuf(thread->dstbuf, dst_off, len); + dmatest_init_srcs(thread->srcs, src_off, len); + dmatest_init_dsts(thread->dsts, dst_off, len); - dma_src = dma_map_single(dev->dev, thread->srcbuf + src_off, - len, DMA_TO_DEVICE); + for (i = 0; i < src_cnt; i++) { + u8 *buf = thread->srcs[i] + src_off; + + dma_srcs[i] = dma_map_single(dev->dev, buf, len, + DMA_TO_DEVICE); + } /* map with DMA_BIDIRECTIONAL to force writeback/invalidate */ - dma_dest = dma_map_single(dev->dev, thread->dstbuf, - test_buf_size, DMA_BIDIRECTIONAL); + for (i = 0; i < dst_cnt; i++) { + dma_dsts[i] = dma_map_single(dev->dev, thread->dsts[i], + test_buf_size, + DMA_BIDIRECTIONAL); + } + + if (thread->type == DMA_MEMCPY) + tx = dev->device_prep_dma_memcpy(chan, + dma_dsts[0] + dst_off, + dma_srcs[0], len, + flags); + else if (thread->type == DMA_XOR) + tx = dev->device_prep_dma_xor(chan, + dma_dsts[0] + dst_off, + dma_srcs, xor_sources, + len, flags); - tx = dev->device_prep_dma_memcpy(chan, dma_dest + dst_off, - dma_src, len, - DMA_CTRL_ACK | DMA_COMPL_SKIP_DEST_UNMAP); if (!tx) { - dma_unmap_single(dev->dev, dma_src, len, DMA_TO_DEVICE); - dma_unmap_single(dev->dev, dma_dest, - test_buf_size, DMA_BIDIRECTIONAL); + for (i = 0; i < src_cnt; i++) + dma_unmap_single(dev->dev, dma_srcs[i], len, + DMA_TO_DEVICE); + for (i = 0; i < dst_cnt; i++) + dma_unmap_single(dev->dev, dma_dsts[i], + test_buf_size, + DMA_BIDIRECTIONAL); pr_warning("%s: #%u: prep error with src_off=0x%x " "dst_off=0x%x len=0x%x\n", thread_name, total_tests - 1, @@ -251,7 +327,10 @@ static int dmatest_func(void *data) failed_tests++; continue; } - tx->callback = NULL; + + init_completion(&cmp); + tx->callback = dmatest_callback; + tx->callback_param = &cmp; cookie = tx->tx_submit(tx); if (dma_submit_error(cookie)) { @@ -263,44 +342,50 @@ static int dmatest_func(void *data) failed_tests++; continue; } - dma_async_memcpy_issue_pending(chan); + dma_async_issue_pending(chan); - do { - msleep(1); - status = dma_async_memcpy_complete( - chan, cookie, NULL, NULL); - } while (status == DMA_IN_PROGRESS); + tmo = wait_for_completion_timeout(&cmp, tmo); + status = dma_async_is_tx_complete(chan, cookie, NULL, NULL); - if (status == DMA_ERROR) { - pr_warning("%s: #%u: error during copy\n", - thread_name, total_tests - 1); + if (tmo == 0) { + pr_warning("%s: #%u: test timed out\n", + thread_name, total_tests - 1); + failed_tests++; + continue; + } else if (status != DMA_SUCCESS) { + pr_warning("%s: #%u: got completion callback," + " but status is \'%s\'\n", + thread_name, total_tests - 1, + status == DMA_ERROR ? "error" : "in progress"); failed_tests++; continue; } + /* Unmap by myself (see DMA_COMPL_SKIP_DEST_UNMAP above) */ - dma_unmap_single(dev->dev, dma_dest, - test_buf_size, DMA_BIDIRECTIONAL); + for (i = 0; i < dst_cnt; i++) + dma_unmap_single(dev->dev, dma_dsts[i], test_buf_size, + DMA_BIDIRECTIONAL); error_count = 0; pr_debug("%s: verifying source buffer...\n", thread_name); - error_count += dmatest_verify(thread->srcbuf, 0, src_off, + error_count += dmatest_verify(thread->srcs, 0, src_off, 0, PATTERN_SRC, true); - error_count += dmatest_verify(thread->srcbuf, src_off, + error_count += dmatest_verify(thread->srcs, src_off, src_off + len, src_off, PATTERN_SRC | PATTERN_COPY, true); - error_count += dmatest_verify(thread->srcbuf, src_off + len, + error_count += dmatest_verify(thread->srcs, src_off + len, test_buf_size, src_off + len, PATTERN_SRC, true); pr_debug("%s: verifying dest buffer...\n", thread->task->comm); - error_count += dmatest_verify(thread->dstbuf, 0, dst_off, + error_count += dmatest_verify(thread->dsts, 0, dst_off, 0, PATTERN_DST, false); - error_count += dmatest_verify(thread->dstbuf, dst_off, + error_count += dmatest_verify(thread->dsts, dst_off, dst_off + len, src_off, PATTERN_SRC | PATTERN_COPY, false); - error_count += dmatest_verify(thread->dstbuf, dst_off + len, + error_count += dmatest_verify(thread->dsts, dst_off + len, test_buf_size, dst_off + len, PATTERN_DST, false); @@ -319,10 +404,16 @@ static int dmatest_func(void *data) } ret = 0; - kfree(thread->dstbuf); + for (i = 0; thread->dsts[i]; i++) + kfree(thread->dsts[i]); err_dstbuf: - kfree(thread->srcbuf); + kfree(thread->dsts); +err_dsts: + for (i = 0; thread->srcs[i]; i++) + kfree(thread->srcs[i]); err_srcbuf: + kfree(thread->srcs); +err_srcs: pr_notice("%s: terminating after %u tests, %u failures (status %d)\n", thread_name, total_tests, failed_tests, ret); return ret; @@ -344,35 +435,36 @@ static void dmatest_cleanup_channel(struct dmatest_chan *dtc) kfree(dtc); } -static int dmatest_add_channel(struct dma_chan *chan) +static int dmatest_add_threads(struct dmatest_chan *dtc, enum dma_transaction_type type) { - struct dmatest_chan *dtc; - struct dmatest_thread *thread; - unsigned int i; - - dtc = kmalloc(sizeof(struct dmatest_chan), GFP_KERNEL); - if (!dtc) { - pr_warning("dmatest: No memory for %s\n", dma_chan_name(chan)); - return -ENOMEM; - } + struct dmatest_thread *thread; + struct dma_chan *chan = dtc->chan; + char *op; + unsigned int i; - dtc->chan = chan; - INIT_LIST_HEAD(&dtc->threads); + if (type == DMA_MEMCPY) + op = "copy"; + else if (type == DMA_XOR) + op = "xor"; + else + return -EINVAL; for (i = 0; i < threads_per_chan; i++) { thread = kzalloc(sizeof(struct dmatest_thread), GFP_KERNEL); if (!thread) { - pr_warning("dmatest: No memory for %s-test%u\n", - dma_chan_name(chan), i); + pr_warning("dmatest: No memory for %s-%s%u\n", + dma_chan_name(chan), op, i); + break; } thread->chan = dtc->chan; + thread->type = type; smp_wmb(); - thread->task = kthread_run(dmatest_func, thread, "%s-test%u", - dma_chan_name(chan), i); + thread->task = kthread_run(dmatest_func, thread, "%s-%s%u", + dma_chan_name(chan), op, i); if (IS_ERR(thread->task)) { - pr_warning("dmatest: Failed to run thread %s-test%u\n", - dma_chan_name(chan), i); + pr_warning("dmatest: Failed to run thread %s-%s%u\n", + dma_chan_name(chan), op, i); kfree(thread); break; } @@ -382,7 +474,36 @@ static int dmatest_add_channel(struct dma_chan *chan) list_add_tail(&thread->node, &dtc->threads); } - pr_info("dmatest: Started %u threads using %s\n", i, dma_chan_name(chan)); + return i; +} + +static int dmatest_add_channel(struct dma_chan *chan) +{ + struct dmatest_chan *dtc; + struct dma_device *dma_dev = chan->device; + unsigned int thread_count = 0; + unsigned int cnt; + + dtc = kmalloc(sizeof(struct dmatest_chan), GFP_KERNEL); + if (!dtc) { + pr_warning("dmatest: No memory for %s\n", dma_chan_name(chan)); + return -ENOMEM; + } + + dtc->chan = chan; + INIT_LIST_HEAD(&dtc->threads); + + if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask)) { + cnt = dmatest_add_threads(dtc, DMA_MEMCPY); + thread_count += cnt > 0 ?: 0; + } + if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) { + cnt = dmatest_add_threads(dtc, DMA_XOR); + thread_count += cnt > 0 ?: 0; + } + + pr_info("dmatest: Started %u threads using %s\n", + thread_count, dma_chan_name(chan)); list_add_tail(&dtc->node, &dmatest_channels); nr_channels++; diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c index 20ad3d26bec..98c9a847bf5 100644 --- a/drivers/dma/dw_dmac.c +++ b/drivers/dma/dw_dmac.c @@ -363,6 +363,82 @@ static void dwc_handle_error(struct dw_dma *dw, struct dw_dma_chan *dwc) dwc_descriptor_complete(dwc, bad_desc); } +/* --------------------- Cyclic DMA API extensions -------------------- */ + +inline dma_addr_t dw_dma_get_src_addr(struct dma_chan *chan) +{ + struct dw_dma_chan *dwc = to_dw_dma_chan(chan); + return channel_readl(dwc, SAR); +} +EXPORT_SYMBOL(dw_dma_get_src_addr); + +inline dma_addr_t dw_dma_get_dst_addr(struct dma_chan *chan) +{ + struct dw_dma_chan *dwc = to_dw_dma_chan(chan); + return channel_readl(dwc, DAR); +} +EXPORT_SYMBOL(dw_dma_get_dst_addr); + +/* called with dwc->lock held and all DMAC interrupts disabled */ +static void dwc_handle_cyclic(struct dw_dma *dw, struct dw_dma_chan *dwc, + u32 status_block, u32 status_err, u32 status_xfer) +{ + if (status_block & dwc->mask) { + void (*callback)(void *param); + void *callback_param; + + dev_vdbg(chan2dev(&dwc->chan), "new cyclic period llp 0x%08x\n", + channel_readl(dwc, LLP)); + dma_writel(dw, CLEAR.BLOCK, dwc->mask); + + callback = dwc->cdesc->period_callback; + callback_param = dwc->cdesc->period_callback_param; + if (callback) { + spin_unlock(&dwc->lock); + callback(callback_param); + spin_lock(&dwc->lock); + } + } + + /* + * Error and transfer complete are highly unlikely, and will most + * likely be due to a configuration error by the user. + */ + if (unlikely(status_err & dwc->mask) || + unlikely(status_xfer & dwc->mask)) { + int i; + + dev_err(chan2dev(&dwc->chan), "cyclic DMA unexpected %s " + "interrupt, stopping DMA transfer\n", + status_xfer ? "xfer" : "error"); + dev_err(chan2dev(&dwc->chan), + " SAR: 0x%x DAR: 0x%x LLP: 0x%x CTL: 0x%x:%08x\n", + channel_readl(dwc, SAR), + channel_readl(dwc, DAR), + channel_readl(dwc, LLP), + channel_readl(dwc, CTL_HI), + channel_readl(dwc, CTL_LO)); + + channel_clear_bit(dw, CH_EN, dwc->mask); + while (dma_readl(dw, CH_EN) & dwc->mask) + cpu_relax(); + + /* make sure DMA does not restart by loading a new list */ + channel_writel(dwc, LLP, 0); + channel_writel(dwc, CTL_LO, 0); + channel_writel(dwc, CTL_HI, 0); + + dma_writel(dw, CLEAR.BLOCK, dwc->mask); + dma_writel(dw, CLEAR.ERROR, dwc->mask); + dma_writel(dw, CLEAR.XFER, dwc->mask); + + for (i = 0; i < dwc->cdesc->periods; i++) + dwc_dump_lli(dwc, &dwc->cdesc->desc[i]->lli); + } +} + +/* ------------------------------------------------------------------------- */ + static void dw_dma_tasklet(unsigned long data) { struct dw_dma *dw = (struct dw_dma *)data; @@ -382,7 +458,10 @@ static void dw_dma_tasklet(unsigned long data) for (i = 0; i < dw->dma.chancnt; i++) { dwc = &dw->chan[i]; spin_lock(&dwc->lock); - if (status_err & (1 << i)) + if (test_bit(DW_DMA_IS_CYCLIC, &dwc->flags)) + dwc_handle_cyclic(dw, dwc, status_block, status_err, + status_xfer); + else if (status_err & (1 << i)) dwc_handle_error(dw, dwc); else if ((status_block | status_xfer) & (1 << i)) dwc_scan_descriptors(dw, dwc); @@ -826,7 +905,6 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan) dma_async_tx_descriptor_init(&desc->txd, chan); desc->txd.tx_submit = dwc_tx_submit; desc->txd.flags = DMA_CTRL_ACK; - INIT_LIST_HEAD(&desc->txd.tx_list); desc->txd.phys = dma_map_single(chan2parent(chan), &desc->lli, sizeof(desc->lli), DMA_TO_DEVICE); dwc_desc_put(dwc, desc); @@ -884,6 +962,257 @@ static void dwc_free_chan_resources(struct dma_chan *chan) dev_vdbg(chan2dev(chan), "free_chan_resources done\n"); } +/* --------------------- Cyclic DMA API extensions -------------------- */ + +/** + * dw_dma_cyclic_start - start the cyclic DMA transfer + * @chan: the DMA channel to start + * + * Must be called with soft interrupts disabled. Returns zero on success or + * -errno on failure. + */ +int dw_dma_cyclic_start(struct dma_chan *chan) +{ + struct dw_dma_chan *dwc = to_dw_dma_chan(chan); + struct dw_dma *dw = to_dw_dma(dwc->chan.device); + + if (!test_bit(DW_DMA_IS_CYCLIC, &dwc->flags)) { + dev_err(chan2dev(&dwc->chan), "missing prep for cyclic DMA\n"); + return -ENODEV; + } + + spin_lock(&dwc->lock); + + /* assert channel is idle */ + if (dma_readl(dw, CH_EN) & dwc->mask) { + dev_err(chan2dev(&dwc->chan), + "BUG: Attempted to start non-idle channel\n"); + dev_err(chan2dev(&dwc->chan), + " SAR: 0x%x DAR: 0x%x LLP: 0x%x CTL: 0x%x:%08x\n", + channel_readl(dwc, SAR), + channel_readl(dwc, DAR), + channel_readl(dwc, LLP), + channel_readl(dwc, CTL_HI), + channel_readl(dwc, CTL_LO)); + spin_unlock(&dwc->lock); + return -EBUSY; + } + + dma_writel(dw, CLEAR.BLOCK, dwc->mask); + dma_writel(dw, CLEAR.ERROR, dwc->mask); + dma_writel(dw, CLEAR.XFER, dwc->mask); + + /* setup DMAC channel registers */ + channel_writel(dwc, LLP, dwc->cdesc->desc[0]->txd.phys); + channel_writel(dwc, CTL_LO, DWC_CTLL_LLP_D_EN | DWC_CTLL_LLP_S_EN); + channel_writel(dwc, CTL_HI, 0); + + channel_set_bit(dw, CH_EN, dwc->mask); + + spin_unlock(&dwc->lock); + + return 0; +} +EXPORT_SYMBOL(dw_dma_cyclic_start); + +/** + * dw_dma_cyclic_stop - stop the cyclic DMA transfer + * @chan: the DMA channel to stop + * + * Must be called with soft interrupts disabled. + */ +void dw_dma_cyclic_stop(struct dma_chan *chan) +{ + struct dw_dma_chan *dwc = to_dw_dma_chan(chan); + struct dw_dma *dw = to_dw_dma(dwc->chan.device); + + spin_lock(&dwc->lock); + + channel_clear_bit(dw, CH_EN, dwc->mask); + while (dma_readl(dw, CH_EN) & dwc->mask) + cpu_relax(); + + spin_unlock(&dwc->lock); +} +EXPORT_SYMBOL(dw_dma_cyclic_stop); + +/** + * dw_dma_cyclic_prep - prepare the cyclic DMA transfer + * @chan: the DMA channel to prepare + * @buf_addr: physical DMA address where the buffer starts + * @buf_len: total number of bytes for the entire buffer + * @period_len: number of bytes for each period + * @direction: transfer direction, to or from device + * + * Must be called before trying to start the transfer. Returns a valid struct + * dw_cyclic_desc if successful or an ERR_PTR(-errno) if not successful. + */ +struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan, + dma_addr_t buf_addr, size_t buf_len, size_t period_len, + enum dma_data_direction direction) +{ + struct dw_dma_chan *dwc = to_dw_dma_chan(chan); + struct dw_cyclic_desc *cdesc; + struct dw_cyclic_desc *retval = NULL; + struct dw_desc *desc; + struct dw_desc *last = NULL; + struct dw_dma_slave *dws = chan->private; + unsigned long was_cyclic; + unsigned int reg_width; + unsigned int periods; + unsigned int i; + + spin_lock_bh(&dwc->lock); + if (!list_empty(&dwc->queue) || !list_empty(&dwc->active_list)) { + spin_unlock_bh(&dwc->lock); + dev_dbg(chan2dev(&dwc->chan), + "queue and/or active list are not empty\n"); + return ERR_PTR(-EBUSY); + } + + was_cyclic = test_and_set_bit(DW_DMA_IS_CYCLIC, &dwc->flags); + spin_unlock_bh(&dwc->lock); + if (was_cyclic) { + dev_dbg(chan2dev(&dwc->chan), + "channel already prepared for cyclic DMA\n"); + return ERR_PTR(-EBUSY); + } + + retval = ERR_PTR(-EINVAL); + reg_width = dws->reg_width; + periods = buf_len / period_len; + + /* Check for too big/unaligned periods and unaligned DMA buffer. */ + if (period_len > (DWC_MAX_COUNT << reg_width)) + goto out_err; + if (unlikely(period_len & ((1 << reg_width) - 1))) + goto out_err; + if (unlikely(buf_addr & ((1 << reg_width) - 1))) + goto out_err; + if (unlikely(!(direction & (DMA_TO_DEVICE | DMA_FROM_DEVICE)))) + goto out_err; + + retval = ERR_PTR(-ENOMEM); + + if (periods > NR_DESCS_PER_CHANNEL) + goto out_err; + + cdesc = kzalloc(sizeof(struct dw_cyclic_desc), GFP_KERNEL); + if (!cdesc) + goto out_err; + + cdesc->desc = kzalloc(sizeof(struct dw_desc *) * periods, GFP_KERNEL); + if (!cdesc->desc) + goto out_err_alloc; + + for (i = 0; i < periods; i++) { + desc = dwc_desc_get(dwc); + if (!desc) + goto out_err_desc_get; + + switch (direction) { + case DMA_TO_DEVICE: + desc->lli.dar = dws->tx_reg; + desc->lli.sar = buf_addr + (period_len * i); + desc->lli.ctllo = (DWC_DEFAULT_CTLLO + | DWC_CTLL_DST_WIDTH(reg_width) + | DWC_CTLL_SRC_WIDTH(reg_width) + | DWC_CTLL_DST_FIX + | DWC_CTLL_SRC_INC + | DWC_CTLL_FC_M2P + | DWC_CTLL_INT_EN); + break; + case DMA_FROM_DEVICE: + desc->lli.dar = buf_addr + (period_len * i); + desc->lli.sar = dws->rx_reg; + desc->lli.ctllo = (DWC_DEFAULT_CTLLO + | DWC_CTLL_SRC_WIDTH(reg_width) + | DWC_CTLL_DST_WIDTH(reg_width) + | DWC_CTLL_DST_INC + | DWC_CTLL_SRC_FIX + | DWC_CTLL_FC_P2M + | DWC_CTLL_INT_EN); + break; + default: + break; + } + + desc->lli.ctlhi = (period_len >> reg_width); + cdesc->desc[i] = desc; + + if (last) { + last->lli.llp = desc->txd.phys; + dma_sync_single_for_device(chan2parent(chan), + last->txd.phys, sizeof(last->lli), + DMA_TO_DEVICE); + } + + last = desc; + } + + /* lets make a cyclic list */ + last->lli.llp = cdesc->desc[0]->txd.phys; + dma_sync_single_for_device(chan2parent(chan), last->txd.phys, + sizeof(last->lli), DMA_TO_DEVICE); + + dev_dbg(chan2dev(&dwc->chan), "cyclic prepared buf 0x%08x len %zu " + "period %zu periods %d\n", buf_addr, buf_len, + period_len, periods); + + cdesc->periods = periods; + dwc->cdesc = cdesc; + + return cdesc; + +out_err_desc_get: + while (i--) + dwc_desc_put(dwc, cdesc->desc[i]); +out_err_alloc: + kfree(cdesc); +out_err: + clear_bit(DW_DMA_IS_CYCLIC, &dwc->flags); + return (struct dw_cyclic_desc *)retval; +} +EXPORT_SYMBOL(dw_dma_cyclic_prep); + +/** + * dw_dma_cyclic_free - free a prepared cyclic DMA transfer + * @chan: the DMA channel to free + */ +void dw_dma_cyclic_free(struct dma_chan *chan) +{ + struct dw_dma_chan *dwc = to_dw_dma_chan(chan); + struct dw_dma *dw = to_dw_dma(dwc->chan.device); + struct dw_cyclic_desc *cdesc = dwc->cdesc; + int i; + + dev_dbg(chan2dev(&dwc->chan), "cyclic free\n"); + + if (!cdesc) + return; + + spin_lock_bh(&dwc->lock); + + channel_clear_bit(dw, CH_EN, dwc->mask); + while (dma_readl(dw, CH_EN) & dwc->mask) + cpu_relax(); + + dma_writel(dw, CLEAR.BLOCK, dwc->mask); + dma_writel(dw, CLEAR.ERROR, dwc->mask); + dma_writel(dw, CLEAR.XFER, dwc->mask); + + spin_unlock_bh(&dwc->lock); + + for (i = 0; i < cdesc->periods; i++) + dwc_desc_put(dwc, cdesc->desc[i]); + + kfree(cdesc->desc); + kfree(cdesc); + + clear_bit(DW_DMA_IS_CYCLIC, &dwc->flags); +} +EXPORT_SYMBOL(dw_dma_cyclic_free); + /*----------------------------------------------------------------------*/ static void dw_dma_off(struct dw_dma *dw) diff --git a/drivers/dma/dw_dmac_regs.h b/drivers/dma/dw_dmac_regs.h index b252b202c5c..13a58076703 100644 --- a/drivers/dma/dw_dmac_regs.h +++ b/drivers/dma/dw_dmac_regs.h @@ -126,6 +126,10 @@ struct dw_dma_regs { #define DW_REGLEN 0x400 +enum dw_dmac_flags { + DW_DMA_IS_CYCLIC = 0, +}; + struct dw_dma_chan { struct dma_chan chan; void __iomem *ch_regs; @@ -134,10 +138,12 @@ struct dw_dma_chan { spinlock_t lock; /* these other elements are all protected by lock */ + unsigned long flags; dma_cookie_t completed; struct list_head active_list; struct list_head queue; struct list_head free_list; + struct dw_cyclic_desc *cdesc; unsigned int descs_allocated; }; @@ -158,7 +164,6 @@ static inline struct dw_dma_chan *to_dw_dma_chan(struct dma_chan *chan) return container_of(chan, struct dw_dma_chan, chan); } - struct dw_dma { struct dma_device dma; void __iomem *regs; diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index 86d6da47f55..da8a8ed9e41 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -354,7 +354,6 @@ static struct fsl_desc_sw *fsl_dma_alloc_descriptor( dma_async_tx_descriptor_init(&desc_sw->async_tx, &fsl_chan->common); desc_sw->async_tx.tx_submit = fsl_dma_tx_submit; - INIT_LIST_HEAD(&desc_sw->async_tx.tx_list); desc_sw->async_tx.phys = pdesc; } diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c index 5905cd36bcd..e4fc33c1c32 100644 --- a/drivers/dma/ioat_dma.c +++ b/drivers/dma/ioat_dma.c @@ -693,7 +693,6 @@ static struct ioat_desc_sw *ioat_dma_alloc_descriptor( desc_sw->async_tx.tx_submit = ioat2_tx_submit; break; } - INIT_LIST_HEAD(&desc_sw->async_tx.tx_list); desc_sw->hw = desc; desc_sw->async_tx.phys = phys; diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c index 16adbe61cfb..2f052265122 100644 --- a/drivers/dma/iop-adma.c +++ b/drivers/dma/iop-adma.c @@ -498,7 +498,6 @@ static int iop_adma_alloc_chan_resources(struct dma_chan *chan) slot->async_tx.tx_submit = iop_adma_tx_submit; INIT_LIST_HEAD(&slot->chain_node); INIT_LIST_HEAD(&slot->slot_node); - INIT_LIST_HEAD(&slot->async_tx.tx_list); hw_desc = (char *) iop_chan->device->dma_desc_pool; slot->async_tx.phys = (dma_addr_t) &hw_desc[idx * IOP_ADMA_SLOT_SIZE]; diff --git a/drivers/dma/ipu/ipu_idmac.c b/drivers/dma/ipu/ipu_idmac.c index da781d10789..e202a6ce557 100644 --- a/drivers/dma/ipu/ipu_idmac.c +++ b/drivers/dma/ipu/ipu_idmac.c @@ -28,6 +28,9 @@ #define FS_VF_IN_VALID 0x00000002 #define FS_ENC_IN_VALID 0x00000001 +static int ipu_disable_channel(struct idmac *idmac, struct idmac_channel *ichan, + bool wait_for_stop); + /* * There can be only one, we could allocate it dynamically, but then we'd have * to add an extra parameter to some functions, and use something as ugly as @@ -107,7 +110,7 @@ static uint32_t bytes_per_pixel(enum pixel_fmt fmt) } } -/* Enable / disable direct write to memory by the Camera Sensor Interface */ +/* Enable direct write to memory by the Camera Sensor Interface */ static void ipu_ic_enable_task(struct ipu *ipu, enum ipu_channel channel) { uint32_t ic_conf, mask; @@ -126,6 +129,7 @@ static void ipu_ic_enable_task(struct ipu *ipu, enum ipu_channel channel) idmac_write_icreg(ipu, ic_conf, IC_CONF); } +/* Called under spin_lock_irqsave(&ipu_data.lock) */ static void ipu_ic_disable_task(struct ipu *ipu, enum ipu_channel channel) { uint32_t ic_conf, mask; @@ -422,7 +426,7 @@ static void ipu_ch_param_set_size(union chan_param_mem *params, break; default: dev_err(ipu_data.dev, - "mxc ipu: unimplemented pixel format %d\n", pixel_fmt); + "mx3 ipu: unimplemented pixel format %d\n", pixel_fmt); break; } @@ -433,20 +437,20 @@ static void ipu_ch_param_set_burst_size(union chan_param_mem *params, uint16_t burst_pixels) { params->pp.npb = burst_pixels - 1; -}; +} static void ipu_ch_param_set_buffer(union chan_param_mem *params, dma_addr_t buf0, dma_addr_t buf1) { params->pp.eba0 = buf0; params->pp.eba1 = buf1; -}; +} static void ipu_ch_param_set_rotation(union chan_param_mem *params, enum ipu_rotate_mode rotate) { params->pp.bam = rotate; -}; +} static void ipu_write_param_mem(uint32_t addr, uint32_t *data, uint32_t num_words) @@ -571,7 +575,7 @@ static uint32_t dma_param_addr(uint32_t dma_ch) { /* Channel Parameter Memory */ return 0x10000 | (dma_ch << 4); -}; +} static void ipu_channel_set_priority(struct ipu *ipu, enum ipu_channel channel, bool prio) @@ -611,7 +615,8 @@ static uint32_t ipu_channel_conf_mask(enum ipu_channel channel) /** * ipu_enable_channel() - enable an IPU channel. - * @channel: channel ID. + * @idmac: IPU DMAC context. + * @ichan: IDMAC channel. * @return: 0 on success or negative error code on failure. */ static int ipu_enable_channel(struct idmac *idmac, struct idmac_channel *ichan) @@ -649,7 +654,7 @@ static int ipu_enable_channel(struct idmac *idmac, struct idmac_channel *ichan) /** * ipu_init_channel_buffer() - initialize a buffer for logical IPU channel. - * @channel: channel ID. + * @ichan: IDMAC channel. * @pixel_fmt: pixel format of buffer. Pixel format is a FOURCC ASCII code. * @width: width of buffer in pixels. * @height: height of buffer in pixels. @@ -687,7 +692,7 @@ static int ipu_init_channel_buffer(struct idmac_channel *ichan, } /* IC channel's stride must be a multiple of 8 pixels */ - if ((channel <= 13) && (stride % 8)) { + if ((channel <= IDMAC_IC_13) && (stride % 8)) { dev_err(ipu->dev, "Stride must be 8 pixel multiple\n"); return -EINVAL; } @@ -752,7 +757,7 @@ static void ipu_select_buffer(enum ipu_channel channel, int buffer_n) /** * ipu_update_channel_buffer() - update physical address of a channel buffer. - * @channel: channel ID. + * @ichan: IDMAC channel. * @buffer_n: buffer number to update. * 0 or 1 are the only valid values. * @phyaddr: buffer physical address. @@ -760,9 +765,10 @@ static void ipu_select_buffer(enum ipu_channel channel, int buffer_n) * function will fail if the buffer is set to ready. */ /* Called under spin_lock(_irqsave)(&ichan->lock) */ -static int ipu_update_channel_buffer(enum ipu_channel channel, +static int ipu_update_channel_buffer(struct idmac_channel *ichan, int buffer_n, dma_addr_t phyaddr) { + enum ipu_channel channel = ichan->dma_chan.chan_id; uint32_t reg; unsigned long flags; @@ -771,8 +777,8 @@ static int ipu_update_channel_buffer(enum ipu_channel channel, if (buffer_n == 0) { reg = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF0_RDY); if (reg & (1UL << channel)) { - spin_unlock_irqrestore(&ipu_data.lock, flags); - return -EACCES; + ipu_ic_disable_task(&ipu_data, channel); + ichan->status = IPU_CHANNEL_READY; } /* 44.3.3.1.9 - Row Number 1 (WORD1, offset 0) */ @@ -782,8 +788,8 @@ static int ipu_update_channel_buffer(enum ipu_channel channel, } else { reg = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF1_RDY); if (reg & (1UL << channel)) { - spin_unlock_irqrestore(&ipu_data.lock, flags); - return -EACCES; + ipu_ic_disable_task(&ipu_data, channel); + ichan->status = IPU_CHANNEL_READY; } /* Check if double-buffering is already enabled */ @@ -805,6 +811,39 @@ static int ipu_update_channel_buffer(enum ipu_channel channel, } /* Called under spin_lock_irqsave(&ichan->lock) */ +static int ipu_submit_buffer(struct idmac_channel *ichan, + struct idmac_tx_desc *desc, struct scatterlist *sg, int buf_idx) +{ + unsigned int chan_id = ichan->dma_chan.chan_id; + struct device *dev = &ichan->dma_chan.dev->device; + int ret; + + if (async_tx_test_ack(&desc->txd)) + return -EINTR; + + /* + * On first invocation this shouldn't be necessary, the call to + * ipu_init_channel_buffer() above will set addresses for us, so we + * could make it conditional on status >= IPU_CHANNEL_ENABLED, but + * doing it again shouldn't hurt either. + */ + ret = ipu_update_channel_buffer(ichan, buf_idx, + sg_dma_address(sg)); + + if (ret < 0) { + dev_err(dev, "Updating sg %p on channel 0x%x buffer %d failed!\n", + sg, chan_id, buf_idx); + return ret; + } + + ipu_select_buffer(chan_id, buf_idx); + dev_dbg(dev, "Updated sg %p on channel 0x%x buffer %d\n", + sg, chan_id, buf_idx); + + return 0; +} + +/* Called under spin_lock_irqsave(&ichan->lock) */ static int ipu_submit_channel_buffers(struct idmac_channel *ichan, struct idmac_tx_desc *desc) { @@ -815,20 +854,10 @@ static int ipu_submit_channel_buffers(struct idmac_channel *ichan, if (!ichan->sg[i]) { ichan->sg[i] = sg; - /* - * On first invocation this shouldn't be necessary, the - * call to ipu_init_channel_buffer() above will set - * addresses for us, so we could make it conditional - * on status >= IPU_CHANNEL_ENABLED, but doing it again - * shouldn't hurt either. - */ - ret = ipu_update_channel_buffer(ichan->dma_chan.chan_id, i, - sg_dma_address(sg)); + ret = ipu_submit_buffer(ichan, desc, sg, i); if (ret < 0) return ret; - ipu_select_buffer(ichan->dma_chan.chan_id, i); - sg = sg_next(sg); } } @@ -842,19 +871,22 @@ static dma_cookie_t idmac_tx_submit(struct dma_async_tx_descriptor *tx) struct idmac_channel *ichan = to_idmac_chan(tx->chan); struct idmac *idmac = to_idmac(tx->chan->device); struct ipu *ipu = to_ipu(idmac); + struct device *dev = &ichan->dma_chan.dev->device; dma_cookie_t cookie; unsigned long flags; + int ret; /* Sanity check */ if (!list_empty(&desc->list)) { /* The descriptor doesn't belong to client */ - dev_err(&ichan->dma_chan.dev->device, - "Descriptor %p not prepared!\n", tx); + dev_err(dev, "Descriptor %p not prepared!\n", tx); return -EBUSY; } mutex_lock(&ichan->chan_mutex); + async_tx_clear_ack(tx); + if (ichan->status < IPU_CHANNEL_READY) { struct idmac_video_param *video = &ichan->params.video; /* @@ -878,16 +910,7 @@ static dma_cookie_t idmac_tx_submit(struct dma_async_tx_descriptor *tx) goto out; } - /* ipu->lock can be taken under ichan->lock, but not v.v. */ - spin_lock_irqsave(&ichan->lock, flags); - - /* submit_buffers() atomically verifies and fills empty sg slots */ - cookie = ipu_submit_channel_buffers(ichan, desc); - - spin_unlock_irqrestore(&ichan->lock, flags); - - if (cookie < 0) - goto out; + dev_dbg(dev, "Submitting sg %p\n", &desc->sg[0]); cookie = ichan->dma_chan.cookie; @@ -897,24 +920,40 @@ static dma_cookie_t idmac_tx_submit(struct dma_async_tx_descriptor *tx) /* from dmaengine.h: "last cookie value returned to client" */ ichan->dma_chan.cookie = cookie; tx->cookie = cookie; + + /* ipu->lock can be taken under ichan->lock, but not v.v. */ spin_lock_irqsave(&ichan->lock, flags); + list_add_tail(&desc->list, &ichan->queue); + /* submit_buffers() atomically verifies and fills empty sg slots */ + ret = ipu_submit_channel_buffers(ichan, desc); + spin_unlock_irqrestore(&ichan->lock, flags); + if (ret < 0) { + cookie = ret; + goto dequeue; + } + if (ichan->status < IPU_CHANNEL_ENABLED) { - int ret = ipu_enable_channel(idmac, ichan); + ret = ipu_enable_channel(idmac, ichan); if (ret < 0) { cookie = ret; - spin_lock_irqsave(&ichan->lock, flags); - list_del_init(&desc->list); - spin_unlock_irqrestore(&ichan->lock, flags); - tx->cookie = cookie; - ichan->dma_chan.cookie = cookie; + goto dequeue; } } dump_idmac_reg(ipu); +dequeue: + if (cookie < 0) { + spin_lock_irqsave(&ichan->lock, flags); + list_del_init(&desc->list); + spin_unlock_irqrestore(&ichan->lock, flags); + tx->cookie = cookie; + ichan->dma_chan.cookie = cookie; + } + out: mutex_unlock(&ichan->chan_mutex); @@ -944,8 +983,6 @@ static int idmac_desc_alloc(struct idmac_channel *ichan, int n) memset(txd, 0, sizeof(*txd)); dma_async_tx_descriptor_init(txd, &ichan->dma_chan); txd->tx_submit = idmac_tx_submit; - txd->chan = &ichan->dma_chan; - INIT_LIST_HEAD(&txd->tx_list); list_add(&desc->list, &ichan->free_list); @@ -1161,6 +1198,24 @@ static int ipu_disable_channel(struct idmac *idmac, struct idmac_channel *ichan, return 0; } +static struct scatterlist *idmac_sg_next(struct idmac_channel *ichan, + struct idmac_tx_desc **desc, struct scatterlist *sg) +{ + struct scatterlist *sgnew = sg ? sg_next(sg) : NULL; + + if (sgnew) + /* next sg-element in this list */ + return sgnew; + + if ((*desc)->list.next == &ichan->queue) + /* No more descriptors on the queue */ + return NULL; + + /* Fetch next descriptor */ + *desc = list_entry((*desc)->list.next, struct idmac_tx_desc, list); + return (*desc)->sg; +} + /* * We have several possibilities here: * current BUF next BUF @@ -1176,23 +1231,46 @@ static int ipu_disable_channel(struct idmac *idmac, struct idmac_channel *ichan, static irqreturn_t idmac_interrupt(int irq, void *dev_id) { struct idmac_channel *ichan = dev_id; + struct device *dev = &ichan->dma_chan.dev->device; unsigned int chan_id = ichan->dma_chan.chan_id; struct scatterlist **sg, *sgnext, *sgnew = NULL; /* Next transfer descriptor */ - struct idmac_tx_desc *desc = NULL, *descnew; + struct idmac_tx_desc *desc, *descnew; dma_async_tx_callback callback; void *callback_param; bool done = false; - u32 ready0 = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF0_RDY), - ready1 = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF1_RDY), - curbuf = idmac_read_ipureg(&ipu_data, IPU_CHA_CUR_BUF); + u32 ready0, ready1, curbuf, err; + unsigned long flags; /* IDMAC has cleared the respective BUFx_RDY bit, we manage the buffer */ - pr_debug("IDMAC irq %d\n", irq); + dev_dbg(dev, "IDMAC irq %d, buf %d\n", irq, ichan->active_buffer); + + spin_lock_irqsave(&ipu_data.lock, flags); + + ready0 = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF0_RDY); + ready1 = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF1_RDY); + curbuf = idmac_read_ipureg(&ipu_data, IPU_CHA_CUR_BUF); + err = idmac_read_ipureg(&ipu_data, IPU_INT_STAT_4); + + if (err & (1 << chan_id)) { + idmac_write_ipureg(&ipu_data, 1 << chan_id, IPU_INT_STAT_4); + spin_unlock_irqrestore(&ipu_data.lock, flags); + /* + * Doing this + * ichan->sg[0] = ichan->sg[1] = NULL; + * you can force channel re-enable on the next tx_submit(), but + * this is dirty - think about descriptors with multiple + * sg elements. + */ + dev_warn(dev, "NFB4EOF on channel %d, ready %x, %x, cur %x\n", + chan_id, ready0, ready1, curbuf); + return IRQ_HANDLED; + } + spin_unlock_irqrestore(&ipu_data.lock, flags); + /* Other interrupts do not interfere with this channel */ spin_lock(&ichan->lock); - if (unlikely(chan_id != IDMAC_SDC_0 && chan_id != IDMAC_SDC_1 && ((curbuf >> chan_id) & 1) == ichan->active_buffer)) { int i = 100; @@ -1207,19 +1285,23 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id) if (!i) { spin_unlock(&ichan->lock); - dev_dbg(ichan->dma_chan.device->dev, + dev_dbg(dev, "IRQ on active buffer on channel %x, active " "%d, ready %x, %x, current %x!\n", chan_id, ichan->active_buffer, ready0, ready1, curbuf); return IRQ_NONE; - } + } else + dev_dbg(dev, + "Buffer deactivated on channel %x, active " + "%d, ready %x, %x, current %x, rest %d!\n", chan_id, + ichan->active_buffer, ready0, ready1, curbuf, i); } if (unlikely((ichan->active_buffer && (ready1 >> chan_id) & 1) || (!ichan->active_buffer && (ready0 >> chan_id) & 1) )) { spin_unlock(&ichan->lock); - dev_dbg(ichan->dma_chan.device->dev, + dev_dbg(dev, "IRQ with active buffer still ready on channel %x, " "active %d, ready %x, %x!\n", chan_id, ichan->active_buffer, ready0, ready1); @@ -1227,8 +1309,9 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id) } if (unlikely(list_empty(&ichan->queue))) { + ichan->sg[ichan->active_buffer] = NULL; spin_unlock(&ichan->lock); - dev_err(ichan->dma_chan.device->dev, + dev_err(dev, "IRQ without queued buffers on channel %x, active %d, " "ready %x, %x!\n", chan_id, ichan->active_buffer, ready0, ready1); @@ -1243,40 +1326,44 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id) sg = &ichan->sg[ichan->active_buffer]; sgnext = ichan->sg[!ichan->active_buffer]; + if (!*sg) { + spin_unlock(&ichan->lock); + return IRQ_HANDLED; + } + + desc = list_entry(ichan->queue.next, struct idmac_tx_desc, list); + descnew = desc; + + dev_dbg(dev, "IDMAC irq %d, dma 0x%08x, next dma 0x%08x, current %d, curbuf 0x%08x\n", + irq, sg_dma_address(*sg), sgnext ? sg_dma_address(sgnext) : 0, ichan->active_buffer, curbuf); + + /* Find the descriptor of sgnext */ + sgnew = idmac_sg_next(ichan, &descnew, *sg); + if (sgnext != sgnew) + dev_err(dev, "Submitted buffer %p, next buffer %p\n", sgnext, sgnew); + /* * if sgnext == NULL sg must be the last element in a scatterlist and * queue must be empty */ if (unlikely(!sgnext)) { - if (unlikely(sg_next(*sg))) { - dev_err(ichan->dma_chan.device->dev, - "Broken buffer-update locking on channel %x!\n", - chan_id); - /* We'll let the user catch up */ + if (!WARN_ON(sg_next(*sg))) + dev_dbg(dev, "Underrun on channel %x\n", chan_id); + ichan->sg[!ichan->active_buffer] = sgnew; + + if (unlikely(sgnew)) { + ipu_submit_buffer(ichan, descnew, sgnew, !ichan->active_buffer); } else { - /* Underrun */ + spin_lock_irqsave(&ipu_data.lock, flags); ipu_ic_disable_task(&ipu_data, chan_id); - dev_dbg(ichan->dma_chan.device->dev, - "Underrun on channel %x\n", chan_id); + spin_unlock_irqrestore(&ipu_data.lock, flags); ichan->status = IPU_CHANNEL_READY; /* Continue to check for complete descriptor */ } } - desc = list_entry(ichan->queue.next, struct idmac_tx_desc, list); - - /* First calculate and submit the next sg element */ - if (likely(sgnext)) - sgnew = sg_next(sgnext); - - if (unlikely(!sgnew)) { - /* Start a new scatterlist, if any queued */ - if (likely(desc->list.next != &ichan->queue)) { - descnew = list_entry(desc->list.next, - struct idmac_tx_desc, list); - sgnew = &descnew->sg[0]; - } - } + /* Calculate and submit the next sg element */ + sgnew = idmac_sg_next(ichan, &descnew, sgnew); if (unlikely(!sg_next(*sg)) || !sgnext) { /* @@ -1289,17 +1376,13 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id) *sg = sgnew; - if (likely(sgnew)) { - int ret; - - ret = ipu_update_channel_buffer(chan_id, ichan->active_buffer, - sg_dma_address(*sg)); - if (ret < 0) - dev_err(ichan->dma_chan.device->dev, - "Failed to update buffer on channel %x buffer %d!\n", - chan_id, ichan->active_buffer); - else - ipu_select_buffer(chan_id, ichan->active_buffer); + if (likely(sgnew) && + ipu_submit_buffer(ichan, descnew, sgnew, ichan->active_buffer) < 0) { + callback = desc->txd.callback; + callback_param = desc->txd.callback_param; + spin_unlock(&ichan->lock); + callback(callback_param); + spin_lock(&ichan->lock); } /* Flip the active buffer - even if update above failed */ @@ -1327,13 +1410,20 @@ static void ipu_gc_tasklet(unsigned long arg) struct idmac_channel *ichan = ipu->channel + i; struct idmac_tx_desc *desc; unsigned long flags; - int j; + struct scatterlist *sg; + int j, k; for (j = 0; j < ichan->n_tx_desc; j++) { desc = ichan->desc + j; spin_lock_irqsave(&ichan->lock, flags); if (async_tx_test_ack(&desc->txd)) { list_move(&desc->list, &ichan->free_list); + for_each_sg(desc->sg, sg, desc->sg_len, k) { + if (ichan->sg[0] == sg) + ichan->sg[0] = NULL; + else if (ichan->sg[1] == sg) + ichan->sg[1] = NULL; + } async_tx_clear_ack(&desc->txd); } spin_unlock_irqrestore(&ichan->lock, flags); @@ -1341,13 +1431,7 @@ static void ipu_gc_tasklet(unsigned long arg) } } -/* - * At the time .device_alloc_chan_resources() method is called, we cannot know, - * whether the client will accept the channel. Thus we must only check, if we - * can satisfy client's request but the only real criterion to verify, whether - * the client has accepted our offer is the client_count. That's why we have to - * perform the rest of our allocation tasks on the first call to this function. - */ +/* Allocate and initialise a transfer descriptor. */ static struct dma_async_tx_descriptor *idmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len, enum dma_data_direction direction, unsigned long tx_flags) @@ -1358,8 +1442,8 @@ static struct dma_async_tx_descriptor *idmac_prep_slave_sg(struct dma_chan *chan unsigned long flags; /* We only can handle these three channels so far */ - if (ichan->dma_chan.chan_id != IDMAC_SDC_0 && ichan->dma_chan.chan_id != IDMAC_SDC_1 && - ichan->dma_chan.chan_id != IDMAC_IC_7) + if (chan->chan_id != IDMAC_SDC_0 && chan->chan_id != IDMAC_SDC_1 && + chan->chan_id != IDMAC_IC_7) return NULL; if (direction != DMA_FROM_DEVICE && direction != DMA_TO_DEVICE) { @@ -1400,7 +1484,7 @@ static void idmac_issue_pending(struct dma_chan *chan) /* This is not always needed, but doesn't hurt either */ spin_lock_irqsave(&ipu->lock, flags); - ipu_select_buffer(ichan->dma_chan.chan_id, ichan->active_buffer); + ipu_select_buffer(chan->chan_id, ichan->active_buffer); spin_unlock_irqrestore(&ipu->lock, flags); /* @@ -1432,8 +1516,7 @@ static void __idmac_terminate_all(struct dma_chan *chan) struct idmac_tx_desc *desc = ichan->desc + i; if (list_empty(&desc->list)) /* Descriptor was prepared, but not submitted */ - list_add(&desc->list, - &ichan->free_list); + list_add(&desc->list, &ichan->free_list); async_tx_clear_ack(&desc->txd); } @@ -1458,6 +1541,28 @@ static void idmac_terminate_all(struct dma_chan *chan) mutex_unlock(&ichan->chan_mutex); } +#ifdef DEBUG +static irqreturn_t ic_sof_irq(int irq, void *dev_id) +{ + struct idmac_channel *ichan = dev_id; + printk(KERN_DEBUG "Got SOF IRQ %d on Channel %d\n", + irq, ichan->dma_chan.chan_id); + disable_irq(irq); + return IRQ_HANDLED; +} + +static irqreturn_t ic_eof_irq(int irq, void *dev_id) +{ + struct idmac_channel *ichan = dev_id; + printk(KERN_DEBUG "Got EOF IRQ %d on Channel %d\n", + irq, ichan->dma_chan.chan_id); + disable_irq(irq); + return IRQ_HANDLED; +} + +static int ic_sof = -EINVAL, ic_eof = -EINVAL; +#endif + static int idmac_alloc_chan_resources(struct dma_chan *chan) { struct idmac_channel *ichan = to_idmac_chan(chan); @@ -1471,31 +1576,49 @@ static int idmac_alloc_chan_resources(struct dma_chan *chan) chan->cookie = 1; ichan->completed = -ENXIO; - ret = ipu_irq_map(ichan->dma_chan.chan_id); + ret = ipu_irq_map(chan->chan_id); if (ret < 0) goto eimap; ichan->eof_irq = ret; + + /* + * Important to first disable the channel, because maybe someone + * used it before us, e.g., the bootloader + */ + ipu_disable_channel(idmac, ichan, true); + + ret = ipu_init_channel(idmac, ichan); + if (ret < 0) + goto eichan; + ret = request_irq(ichan->eof_irq, idmac_interrupt, 0, ichan->eof_name, ichan); if (ret < 0) goto erirq; - ret = ipu_init_channel(idmac, ichan); - if (ret < 0) - goto eichan; +#ifdef DEBUG + if (chan->chan_id == IDMAC_IC_7) { + ic_sof = ipu_irq_map(69); + if (ic_sof > 0) + request_irq(ic_sof, ic_sof_irq, 0, "IC SOF", ichan); + ic_eof = ipu_irq_map(70); + if (ic_eof > 0) + request_irq(ic_eof, ic_eof_irq, 0, "IC EOF", ichan); + } +#endif ichan->status = IPU_CHANNEL_INITIALIZED; - dev_dbg(&ichan->dma_chan.dev->device, "Found channel 0x%x, irq %d\n", - ichan->dma_chan.chan_id, ichan->eof_irq); + dev_dbg(&chan->dev->device, "Found channel 0x%x, irq %d\n", + chan->chan_id, ichan->eof_irq); return ret; -eichan: - free_irq(ichan->eof_irq, ichan); erirq: - ipu_irq_unmap(ichan->dma_chan.chan_id); + ipu_uninit_channel(idmac, ichan); +eichan: + ipu_irq_unmap(chan->chan_id); eimap: return ret; } @@ -1510,8 +1633,22 @@ static void idmac_free_chan_resources(struct dma_chan *chan) __idmac_terminate_all(chan); if (ichan->status > IPU_CHANNEL_FREE) { +#ifdef DEBUG + if (chan->chan_id == IDMAC_IC_7) { + if (ic_sof > 0) { + free_irq(ic_sof, ichan); + ipu_irq_unmap(69); + ic_sof = -EINVAL; + } + if (ic_eof > 0) { + free_irq(ic_eof, ichan); + ipu_irq_unmap(70); + ic_eof = -EINVAL; + } + } +#endif free_irq(ichan->eof_irq, ichan); - ipu_irq_unmap(ichan->dma_chan.chan_id); + ipu_irq_unmap(chan->chan_id); } ichan->status = IPU_CHANNEL_FREE; @@ -1573,7 +1710,7 @@ static int __init ipu_idmac_init(struct ipu *ipu) dma_chan->device = &idmac->dma; dma_chan->cookie = 1; dma_chan->chan_id = i; - list_add_tail(&ichan->dma_chan.device_node, &dma->channels); + list_add_tail(&dma_chan->device_node, &dma->channels); } idmac_write_icreg(ipu, 0x00000070, IDMAC_CONF); @@ -1581,7 +1718,7 @@ static int __init ipu_idmac_init(struct ipu *ipu) return dma_async_device_register(&idmac->dma); } -static void ipu_idmac_exit(struct ipu *ipu) +static void __exit ipu_idmac_exit(struct ipu *ipu) { int i; struct idmac *idmac = &ipu->idmac; @@ -1600,7 +1737,7 @@ static void ipu_idmac_exit(struct ipu *ipu) * IPU common probe / remove */ -static int ipu_probe(struct platform_device *pdev) +static int __init ipu_probe(struct platform_device *pdev) { struct ipu_platform_data *pdata = pdev->dev.platform_data; struct resource *mem_ipu, *mem_ic; @@ -1700,7 +1837,7 @@ err_noirq: return ret; } -static int ipu_remove(struct platform_device *pdev) +static int __exit ipu_remove(struct platform_device *pdev) { struct ipu *ipu = platform_get_drvdata(pdev); @@ -1725,7 +1862,7 @@ static struct platform_driver ipu_platform_driver = { .name = "ipu-core", .owner = THIS_MODULE, }, - .remove = ipu_remove, + .remove = __exit_p(ipu_remove), }; static int __init ipu_init(void) diff --git a/drivers/dma/ipu/ipu_irq.c b/drivers/dma/ipu/ipu_irq.c index 83f532cc767..dd8ebc75b66 100644 --- a/drivers/dma/ipu/ipu_irq.c +++ b/drivers/dma/ipu/ipu_irq.c @@ -352,7 +352,7 @@ static struct irq_chip ipu_irq_chip = { }; /* Install the IRQ handler */ -int ipu_irq_attach_irq(struct ipu *ipu, struct platform_device *dev) +int __init ipu_irq_attach_irq(struct ipu *ipu, struct platform_device *dev) { struct ipu_platform_data *pdata = dev->dev.platform_data; unsigned int irq, irq_base, i; diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index cb7f26fb9f1..ddab94f5122 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -632,7 +632,6 @@ static int mv_xor_alloc_chan_resources(struct dma_chan *chan) slot->async_tx.tx_submit = mv_xor_tx_submit; INIT_LIST_HEAD(&slot->chain_node); INIT_LIST_HEAD(&slot->slot_node); - INIT_LIST_HEAD(&slot->async_tx.tx_list); hw_desc = (char *) mv_chan->device->dma_desc_pool; slot->async_tx.phys = (dma_addr_t) &hw_desc[idx * MV_XOR_SLOT_SIZE]; diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 2281b5098e9..36e0675be9f 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -121,6 +121,7 @@ config MD_RAID10 config MD_RAID456 tristate "RAID-4/RAID-5/RAID-6 mode" depends on BLK_DEV_MD + select MD_RAID6_PQ select ASYNC_MEMCPY select ASYNC_XOR ---help--- @@ -151,34 +152,8 @@ config MD_RAID456 If unsure, say Y. -config MD_RAID5_RESHAPE - bool "Support adding drives to a raid-5 array" - depends on MD_RAID456 - default y - ---help--- - A RAID-5 set can be expanded by adding extra drives. This - requires "restriping" the array which means (almost) every - block must be written to a different place. - - This option allows such restriping to be done while the array - is online. - - You will need mdadm version 2.4.1 or later to use this - feature safely. During the early stage of reshape there is - a critical section where live data is being over-written. A - crash during this time needs extra care for recovery. The - newer mdadm takes a copy of the data in the critical section - and will restore it, if necessary, after a crash. - - The mdadm usage is e.g. - mdadm --grow /dev/md1 --raid-disks=6 - to grow '/dev/md1' to having 6 disks. - - Note: The array can only be expanded, not contracted. - There should be enough spares already present to make the new - array workable. - - If unsure, say Y. +config MD_RAID6_PQ + tristate config MD_MULTIPATH tristate "Multipath I/O support" diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 72880b7e28d..45cc5951d92 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -2,20 +2,21 @@ # Makefile for the kernel software RAID and LVM drivers. # -dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ +dm-mod-y += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o -dm-multipath-objs := dm-path-selector.o dm-mpath.o -dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-snap-transient.o \ +dm-multipath-y += dm-path-selector.o dm-mpath.o +dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ dm-snap-persistent.o -dm-mirror-objs := dm-raid1.o -md-mod-objs := md.o bitmap.o -raid456-objs := raid5.o raid6algos.o raid6recov.o raid6tables.o \ +dm-mirror-y += dm-raid1.o +md-mod-y += md.o bitmap.o +raid456-y += raid5.o +raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ raid6int1.o raid6int2.o raid6int4.o \ raid6int8.o raid6int16.o raid6int32.o \ raid6altivec1.o raid6altivec2.o raid6altivec4.o \ raid6altivec8.o \ raid6mmx.o raid6sse1.o raid6sse2.o -hostprogs-y := mktables +hostprogs-y += mktables # Note: link order is important. All raid personalities # and must come before md.o, as they each initialise @@ -26,6 +27,7 @@ obj-$(CONFIG_MD_LINEAR) += linear.o obj-$(CONFIG_MD_RAID0) += raid0.o obj-$(CONFIG_MD_RAID1) += raid1.o obj-$(CONFIG_MD_RAID10) += raid10.o +obj-$(CONFIG_MD_RAID6_PQ) += raid6_pq.o obj-$(CONFIG_MD_RAID456) += raid456.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_MD_FAULTY) += faulty.o diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 71994376339..f8a9f7ab2cb 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -16,6 +16,7 @@ * wait if count gets too high, wake when it drops to half. */ +#include <linux/blkdev.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/slab.h> @@ -26,8 +27,8 @@ #include <linux/file.h> #include <linux/mount.h> #include <linux/buffer_head.h> -#include <linux/raid/md.h> -#include <linux/raid/bitmap.h> +#include "md.h" +#include "bitmap.h" /* debug macros */ @@ -111,9 +112,10 @@ static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int creat unsigned char *mappage; if (page >= bitmap->pages) { - printk(KERN_ALERT - "%s: invalid bitmap page request: %lu (> %lu)\n", - bmname(bitmap), page, bitmap->pages-1); + /* This can happen if bitmap_start_sync goes beyond + * End-of-device while looking for a whole page. + * It is harmless. + */ return -EINVAL; } @@ -265,7 +267,6 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev) list_for_each_continue_rcu(pos, &mddev->disks) { rdev = list_entry(pos, mdk_rdev_t, same_set); if (rdev->raid_disk >= 0 && - test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) { /* this is a usable devices */ atomic_inc(&rdev->nr_pending); @@ -297,7 +298,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) + size/512 > 0) /* bitmap runs in to metadata */ goto bad_alignment; - if (rdev->data_offset + mddev->size*2 + if (rdev->data_offset + mddev->dev_sectors > rdev->sb_start + bitmap->offset) /* data runs in to bitmap */ goto bad_alignment; @@ -570,7 +571,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || le32_to_cpu(sb->version) > BITMAP_MAJOR_HI) reason = "unrecognized superblock version"; - else if (chunksize < PAGE_SIZE) + else if (chunksize < 512) reason = "bitmap chunksize too small"; else if ((1 << ffz(~chunksize)) != chunksize) reason = "bitmap chunksize not a power of 2"; @@ -1306,6 +1307,9 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); } + if (bitmap->mddev->degraded) + /* Never clear bits or update events_cleared when degraded */ + success = 0; while (sectors) { int blocks; @@ -1345,8 +1349,8 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto } } -int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, - int degraded) +static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, + int degraded) { bitmap_counter_t *bmc; int rv; @@ -1374,6 +1378,29 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, return rv; } +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, + int degraded) +{ + /* bitmap_start_sync must always report on multiples of whole + * pages, otherwise resync (which is very PAGE_SIZE based) will + * get confused. + * So call __bitmap_start_sync repeatedly (if needed) until + * At least PAGE_SIZE>>9 blocks are covered. + * Return the 'or' of the result. + */ + int rv = 0; + int blocks1; + + *blocks = 0; + while (*blocks < (PAGE_SIZE>>9)) { + rv |= __bitmap_start_sync(bitmap, offset, + &blocks1, degraded); + offset += blocks1; + *blocks += blocks1; + } + return rv; +} + void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted) { bitmap_counter_t *bmc; @@ -1443,6 +1470,8 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) wait_event(bitmap->mddev->recovery_wait, atomic_read(&bitmap->mddev->recovery_active) == 0); + bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync; + set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); s = 0; while (s < sector && s < bitmap->mddev->resync_max_sectors) { diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h new file mode 100644 index 00000000000..e98900671ca --- /dev/null +++ b/drivers/md/bitmap.h @@ -0,0 +1,288 @@ +/* + * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 + * + * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. + */ +#ifndef BITMAP_H +#define BITMAP_H 1 + +#define BITMAP_MAJOR_LO 3 +/* version 4 insists the bitmap is in little-endian order + * with version 3, it is host-endian which is non-portable + */ +#define BITMAP_MAJOR_HI 4 +#define BITMAP_MAJOR_HOSTENDIAN 3 + +#define BITMAP_MINOR 39 + +/* + * in-memory bitmap: + * + * Use 16 bit block counters to track pending writes to each "chunk". + * The 2 high order bits are special-purpose, the first is a flag indicating + * whether a resync is needed. The second is a flag indicating whether a + * resync is active. + * This means that the counter is actually 14 bits: + * + * +--------+--------+------------------------------------------------+ + * | resync | resync | counter | + * | needed | active | | + * | (0-1) | (0-1) | (0-16383) | + * +--------+--------+------------------------------------------------+ + * + * The "resync needed" bit is set when: + * a '1' bit is read from storage at startup. + * a write request fails on some drives + * a resync is aborted on a chunk with 'resync active' set + * It is cleared (and resync-active set) when a resync starts across all drives + * of the chunk. + * + * + * The "resync active" bit is set when: + * a resync is started on all drives, and resync_needed is set. + * resync_needed will be cleared (as long as resync_active wasn't already set). + * It is cleared when a resync completes. + * + * The counter counts pending write requests, plus the on-disk bit. + * When the counter is '1' and the resync bits are clear, the on-disk + * bit can be cleared aswell, thus setting the counter to 0. + * When we set a bit, or in the counter (to start a write), if the fields is + * 0, we first set the disk bit and set the counter to 1. + * + * If the counter is 0, the on-disk bit is clear and the stipe is clean + * Anything that dirties the stipe pushes the counter to 2 (at least) + * and sets the on-disk bit (lazily). + * If a periodic sweep find the counter at 2, it is decremented to 1. + * If the sweep find the counter at 1, the on-disk bit is cleared and the + * counter goes to zero. + * + * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block + * counters as a fallback when "page" memory cannot be allocated: + * + * Normal case (page memory allocated): + * + * page pointer (32-bit) + * + * [ ] ------+ + * | + * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) + * c1 c2 c2048 + * + * Hijacked case (page memory allocation failed): + * + * hijacked page pointer (32-bit) + * + * [ ][ ] (no page memory allocated) + * counter #1 (16-bit) counter #2 (16-bit) + * + */ + +#ifdef __KERNEL__ + +#define PAGE_BITS (PAGE_SIZE << 3) +#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) + +typedef __u16 bitmap_counter_t; +#define COUNTER_BITS 16 +#define COUNTER_BIT_SHIFT 4 +#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8) +#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) + +#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) +#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) +#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) +#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) +#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) +#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) + +/* how many counters per page? */ +#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) +/* same, except a shift value for more efficient bitops */ +#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) +/* same, except a mask value for more efficient bitops */ +#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) + +#define BITMAP_BLOCK_SIZE 512 +#define BITMAP_BLOCK_SHIFT 9 + +/* how many blocks per chunk? (this is variable) */ +#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) + +/* when hijacked, the counters and bits represent even larger "chunks" */ +/* there will be 1024 chunks represented by each counter in the page pointers */ +#define PAGEPTR_BLOCK_RATIO(bitmap) \ + (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1) +#define PAGEPTR_BLOCK_SHIFT(bitmap) \ + (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) +#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) + +/* + * on-disk bitmap: + * + * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap + * file a page at a time. There's a superblock at the start of the file. + */ + +/* map chunks (bits) to file pages - offset by the size of the superblock */ +#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3)) + +#endif + +/* + * bitmap structures: + */ + +#define BITMAP_MAGIC 0x6d746962 + +/* use these for bitmap->flags and bitmap->sb->state bit-fields */ +enum bitmap_state { + BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ + BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */ + BITMAP_HOSTENDIAN = 0x8000, +}; + +/* the superblock at the front of the bitmap file -- little endian */ +typedef struct bitmap_super_s { + __le32 magic; /* 0 BITMAP_MAGIC */ + __le32 version; /* 4 the bitmap major for now, could change... */ + __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ + __le64 events; /* 24 event counter for the bitmap (1)*/ + __le64 events_cleared;/*32 event counter when last bit cleared (2) */ + __le64 sync_size; /* 40 the size of the md device's sync range(3) */ + __le32 state; /* 48 bitmap state information */ + __le32 chunksize; /* 52 the bitmap chunk size in bytes */ + __le32 daemon_sleep; /* 56 seconds between disk flushes */ + __le32 write_behind; /* 60 number of outstanding write-behind writes */ + + __u8 pad[256 - 64]; /* set to zero */ +} bitmap_super_t; + +/* notes: + * (1) This event counter is updated before the eventcounter in the md superblock + * When a bitmap is loaded, it is only accepted if this event counter is equal + * to, or one greater than, the event counter in the superblock. + * (2) This event counter is updated when the other one is *if*and*only*if* the + * array is not degraded. As bits are not cleared when the array is degraded, + * this represents the last time that any bits were cleared. + * If a device is being added that has an event count with this value or + * higher, it is accepted as conforming to the bitmap. + * (3)This is the number of sectors represented by the bitmap, and is the range that + * resync happens across. For raid1 and raid5/6 it is the size of individual + * devices. For raid10 it is the size of the array. + */ + +#ifdef __KERNEL__ + +/* the in-memory bitmap is represented by bitmap_pages */ +struct bitmap_page { + /* + * map points to the actual memory page + */ + char *map; + /* + * in emergencies (when map cannot be alloced), hijack the map + * pointer and use it as two counters itself + */ + unsigned int hijacked:1; + /* + * count of dirty bits on the page + */ + unsigned int count:31; +}; + +/* keep track of bitmap file pages that have pending writes on them */ +struct page_list { + struct list_head list; + struct page *page; +}; + +/* the main bitmap structure - one per mddev */ +struct bitmap { + struct bitmap_page *bp; + unsigned long pages; /* total number of pages in the bitmap */ + unsigned long missing_pages; /* number of pages not yet allocated */ + + mddev_t *mddev; /* the md device that the bitmap is for */ + + int counter_bits; /* how many bits per block counter */ + + /* bitmap chunksize -- how much data does each bit represent? */ + unsigned long chunksize; + unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ + unsigned long chunks; /* total number of data chunks for the array */ + + /* We hold a count on the chunk currently being synced, and drop + * it when the last block is started. If the resync is aborted + * midway, we need to be able to drop that count, so we remember + * the counted chunk.. + */ + unsigned long syncchunk; + + __u64 events_cleared; + int need_sync; + + /* bitmap spinlock */ + spinlock_t lock; + + long offset; /* offset from superblock if file is NULL */ + struct file *file; /* backing disk file */ + struct page *sb_page; /* cached copy of the bitmap file superblock */ + struct page **filemap; /* list of cache pages for the file */ + unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ + unsigned long file_pages; /* number of pages in the file */ + int last_page_size; /* bytes in the last page */ + + unsigned long flags; + + int allclean; + + unsigned long max_write_behind; /* write-behind mode */ + atomic_t behind_writes; + + /* + * the bitmap daemon - periodically wakes up and sweeps the bitmap + * file, cleaning up bits and flushing out pages to disk as necessary + */ + unsigned long daemon_lastrun; /* jiffies of last run */ + unsigned long daemon_sleep; /* how many seconds between updates? */ + unsigned long last_end_sync; /* when we lasted called end_sync to + * update bitmap with resync progress */ + + atomic_t pending_writes; /* pending writes to the bitmap file */ + wait_queue_head_t write_wait; + wait_queue_head_t overflow_wait; + +}; + +/* the bitmap API */ + +/* these are used only by md/bitmap */ +int bitmap_create(mddev_t *mddev); +void bitmap_flush(mddev_t *mddev); +void bitmap_destroy(mddev_t *mddev); + +void bitmap_print_sb(struct bitmap *bitmap); +void bitmap_update_sb(struct bitmap *bitmap); + +int bitmap_setallbits(struct bitmap *bitmap); +void bitmap_write_all(struct bitmap *bitmap); + +void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); + +/* these are exported */ +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int behind); +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int success, int behind); +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded); +void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); +void bitmap_close_sync(struct bitmap *bitmap); +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); + +void bitmap_unplug(struct bitmap *bitmap); +void bitmap_daemon_work(struct bitmap *bitmap); +#endif + +#endif diff --git a/drivers/md/dm-bio-list.h b/drivers/md/dm-bio-list.h index d4509be0fe6..345098b4ca7 100644 --- a/drivers/md/dm-bio-list.h +++ b/drivers/md/dm-bio-list.h @@ -52,6 +52,16 @@ static inline void bio_list_add(struct bio_list *bl, struct bio *bio) bl->tail = bio; } +static inline void bio_list_add_head(struct bio_list *bl, struct bio *bio) +{ + bio->bi_next = bl->head; + + bl->head = bio; + + if (!bl->tail) + bl->tail = bio; +} + static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2) { if (!bl2->head) diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h index d3ec217847d..3a8cfa2645c 100644 --- a/drivers/md/dm-bio-record.h +++ b/drivers/md/dm-bio-record.h @@ -16,30 +16,56 @@ * functions in this file help the target record and restore the * original bio state. */ + +struct dm_bio_vec_details { +#if PAGE_SIZE < 65536 + __u16 bv_len; + __u16 bv_offset; +#else + unsigned bv_len; + unsigned bv_offset; +#endif +}; + struct dm_bio_details { sector_t bi_sector; struct block_device *bi_bdev; unsigned int bi_size; unsigned short bi_idx; unsigned long bi_flags; + struct dm_bio_vec_details bi_io_vec[BIO_MAX_PAGES]; }; static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) { + unsigned i; + bd->bi_sector = bio->bi_sector; bd->bi_bdev = bio->bi_bdev; bd->bi_size = bio->bi_size; bd->bi_idx = bio->bi_idx; bd->bi_flags = bio->bi_flags; + + for (i = 0; i < bio->bi_vcnt; i++) { + bd->bi_io_vec[i].bv_len = bio->bi_io_vec[i].bv_len; + bd->bi_io_vec[i].bv_offset = bio->bi_io_vec[i].bv_offset; + } } static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) { + unsigned i; + bio->bi_sector = bd->bi_sector; bio->bi_bdev = bd->bi_bdev; bio->bi_size = bd->bi_size; bio->bi_idx = bd->bi_idx; bio->bi_flags = bd->bi_flags; + + for (i = 0; i < bio->bi_vcnt; i++) { + bio->bi_io_vec[i].bv_len = bd->bi_io_vec[i].bv_len; + bio->bi_io_vec[i].bv_offset = bd->bi_io_vec[i].bv_offset; + } } #endif diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index bfefd079a95..53394e863c7 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1156,8 +1156,7 @@ bad_ivmode: crypto_free_ablkcipher(tfm); bad_cipher: /* Must zero key material before freeing */ - memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); - kfree(cc); + kzfree(cc); return -EINVAL; } @@ -1183,8 +1182,7 @@ static void crypt_dtr(struct dm_target *ti) dm_put_device(ti, cc->dev); /* Must zero key material before freeing */ - memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); - kfree(cc); + kzfree(cc); } static int crypt_map(struct dm_target *ti, struct bio *bio, diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index dccbfb0e010..a2e26c24214 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -7,6 +7,7 @@ #include "dm-exception-store.h" +#include <linux/ctype.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/vmalloc.h> @@ -14,6 +15,257 @@ #define DM_MSG_PREFIX "snapshot exception stores" +static LIST_HEAD(_exception_store_types); +static DEFINE_SPINLOCK(_lock); + +static struct dm_exception_store_type *__find_exception_store_type(const char *name) +{ + struct dm_exception_store_type *type; + + list_for_each_entry(type, &_exception_store_types, list) + if (!strcmp(name, type->name)) + return type; + + return NULL; +} + +static struct dm_exception_store_type *_get_exception_store_type(const char *name) +{ + struct dm_exception_store_type *type; + + spin_lock(&_lock); + + type = __find_exception_store_type(name); + + if (type && !try_module_get(type->module)) + type = NULL; + + spin_unlock(&_lock); + + return type; +} + +/* + * get_type + * @type_name + * + * Attempt to retrieve the dm_exception_store_type by name. If not already + * available, attempt to load the appropriate module. + * + * Exstore modules are named "dm-exstore-" followed by the 'type_name'. + * Modules may contain multiple types. + * This function will first try the module "dm-exstore-<type_name>", + * then truncate 'type_name' on the last '-' and try again. + * + * For example, if type_name was "clustered-shared", it would search + * 'dm-exstore-clustered-shared' then 'dm-exstore-clustered'. + * + * 'dm-exception-store-<type_name>' is too long of a name in my + * opinion, which is why I've chosen to have the files + * containing exception store implementations be 'dm-exstore-<type_name>'. + * If you want your module to be autoloaded, you will follow this + * naming convention. + * + * Returns: dm_exception_store_type* on success, NULL on failure + */ +static struct dm_exception_store_type *get_type(const char *type_name) +{ + char *p, *type_name_dup; + struct dm_exception_store_type *type; + + type = _get_exception_store_type(type_name); + if (type) + return type; + + type_name_dup = kstrdup(type_name, GFP_KERNEL); + if (!type_name_dup) { + DMERR("No memory left to attempt load for \"%s\"", type_name); + return NULL; + } + + while (request_module("dm-exstore-%s", type_name_dup) || + !(type = _get_exception_store_type(type_name))) { + p = strrchr(type_name_dup, '-'); + if (!p) + break; + p[0] = '\0'; + } + + if (!type) + DMWARN("Module for exstore type \"%s\" not found.", type_name); + + kfree(type_name_dup); + + return type; +} + +static void put_type(struct dm_exception_store_type *type) +{ + spin_lock(&_lock); + module_put(type->module); + spin_unlock(&_lock); +} + +int dm_exception_store_type_register(struct dm_exception_store_type *type) +{ + int r = 0; + + spin_lock(&_lock); + if (!__find_exception_store_type(type->name)) + list_add(&type->list, &_exception_store_types); + else + r = -EEXIST; + spin_unlock(&_lock); + + return r; +} +EXPORT_SYMBOL(dm_exception_store_type_register); + +int dm_exception_store_type_unregister(struct dm_exception_store_type *type) +{ + spin_lock(&_lock); + + if (!__find_exception_store_type(type->name)) { + spin_unlock(&_lock); + return -EINVAL; + } + + list_del(&type->list); + + spin_unlock(&_lock); + + return 0; +} +EXPORT_SYMBOL(dm_exception_store_type_unregister); + +/* + * Round a number up to the nearest 'size' boundary. size must + * be a power of 2. + */ +static ulong round_up(ulong n, ulong size) +{ + size--; + return (n + size) & ~size; +} + +static int set_chunk_size(struct dm_exception_store *store, + const char *chunk_size_arg, char **error) +{ + unsigned long chunk_size_ulong; + char *value; + + chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10); + if (*chunk_size_arg == '\0' || *value != '\0') { + *error = "Invalid chunk size"; + return -EINVAL; + } + + if (!chunk_size_ulong) { + store->chunk_size = store->chunk_mask = store->chunk_shift = 0; + return 0; + } + + /* + * Chunk size must be multiple of page size. Silently + * round up if it's not. + */ + chunk_size_ulong = round_up(chunk_size_ulong, PAGE_SIZE >> 9); + + /* Check chunk_size is a power of 2 */ + if (!is_power_of_2(chunk_size_ulong)) { + *error = "Chunk size is not a power of 2"; + return -EINVAL; + } + + /* Validate the chunk size against the device block size */ + if (chunk_size_ulong % (bdev_hardsect_size(store->cow->bdev) >> 9)) { + *error = "Chunk size is not a multiple of device blocksize"; + return -EINVAL; + } + + store->chunk_size = chunk_size_ulong; + store->chunk_mask = chunk_size_ulong - 1; + store->chunk_shift = ffs(chunk_size_ulong) - 1; + + return 0; +} + +int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, + unsigned *args_used, + struct dm_exception_store **store) +{ + int r = 0; + struct dm_exception_store_type *type; + struct dm_exception_store *tmp_store; + char persistent; + + if (argc < 3) { + ti->error = "Insufficient exception store arguments"; + return -EINVAL; + } + + tmp_store = kmalloc(sizeof(*tmp_store), GFP_KERNEL); + if (!tmp_store) { + ti->error = "Exception store allocation failed"; + return -ENOMEM; + } + + persistent = toupper(*argv[1]); + if (persistent != 'P' && persistent != 'N') { + ti->error = "Persistent flag is not P or N"; + return -EINVAL; + } + + type = get_type(argv[1]); + if (!type) { + ti->error = "Exception store type not recognised"; + r = -EINVAL; + goto bad_type; + } + + tmp_store->type = type; + tmp_store->ti = ti; + + r = dm_get_device(ti, argv[0], 0, 0, + FMODE_READ | FMODE_WRITE, &tmp_store->cow); + if (r) { + ti->error = "Cannot get COW device"; + goto bad_cow; + } + + r = set_chunk_size(tmp_store, argv[2], &ti->error); + if (r) + goto bad_cow; + + r = type->ctr(tmp_store, 0, NULL); + if (r) { + ti->error = "Exception store type constructor failed"; + goto bad_ctr; + } + + *args_used = 3; + *store = tmp_store; + return 0; + +bad_ctr: + dm_put_device(ti, tmp_store->cow); +bad_cow: + put_type(type); +bad_type: + kfree(tmp_store); + return r; +} +EXPORT_SYMBOL(dm_exception_store_create); + +void dm_exception_store_destroy(struct dm_exception_store *store) +{ + store->type->dtr(store); + dm_put_device(store->ti, store->cow); + put_type(store->type); + kfree(store); +} +EXPORT_SYMBOL(dm_exception_store_destroy); + int dm_exception_store_init(void) { int r; diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index bb9f33d5daa..0a2e6e7f67b 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -37,11 +37,18 @@ struct dm_snap_exception { * Abstraction to handle the meta/layout of exception stores (the * COW device). */ -struct dm_exception_store { +struct dm_exception_store; +struct dm_exception_store_type { + const char *name; + struct module *module; + + int (*ctr) (struct dm_exception_store *store, + unsigned argc, char **argv); + /* * Destroys this object when you've finished with it. */ - void (*destroy) (struct dm_exception_store *store); + void (*dtr) (struct dm_exception_store *store); /* * The target shouldn't read the COW device until this is @@ -72,8 +79,9 @@ struct dm_exception_store { */ void (*drop_snapshot) (struct dm_exception_store *store); - int (*status) (struct dm_exception_store *store, status_type_t status, - char *result, unsigned int maxlen); + unsigned (*status) (struct dm_exception_store *store, + status_type_t status, char *result, + unsigned maxlen); /* * Return how full the snapshot is. @@ -82,7 +90,21 @@ struct dm_exception_store { sector_t *numerator, sector_t *denominator); - struct dm_snapshot *snap; + /* For internal device-mapper use only. */ + struct list_head list; +}; + +struct dm_exception_store { + struct dm_exception_store_type *type; + struct dm_target *ti; + + struct dm_dev *cow; + + /* Size of data blocks saved - must be a power of 2 */ + chunk_t chunk_size; + chunk_t chunk_mask; + chunk_t chunk_shift; + void *context; }; @@ -129,6 +151,28 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) # endif +/* + * Return the number of sectors in the device. + */ +static inline sector_t get_dev_size(struct block_device *bdev) +{ + return bdev->bd_inode->i_size >> SECTOR_SHIFT; +} + +static inline chunk_t sector_to_chunk(struct dm_exception_store *store, + sector_t sector) +{ + return (sector & ~store->chunk_mask) >> store->chunk_shift; +} + +int dm_exception_store_type_register(struct dm_exception_store_type *type); +int dm_exception_store_type_unregister(struct dm_exception_store_type *type); + +int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, + unsigned *args_used, + struct dm_exception_store **store); +void dm_exception_store_destroy(struct dm_exception_store *store); + int dm_exception_store_init(void); void dm_exception_store_exit(void); @@ -141,8 +185,4 @@ void dm_persistent_snapshot_exit(void); int dm_transient_snapshot_init(void); void dm_transient_snapshot_exit(void); -int dm_create_persistent(struct dm_exception_store *store); - -int dm_create_transient(struct dm_exception_store *store); - #endif /* _LINUX_DM_EXCEPTION_STORE */ diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 36e2b5e46a6..e73aabd61cd 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -370,16 +370,13 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, while (1) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!atomic_read(&io.count) || signal_pending(current)) + if (!atomic_read(&io.count)) break; io_schedule(); } set_current_state(TASK_RUNNING); - if (atomic_read(&io.count)) - return -EINTR; - if (error_bits) *error_bits = io.error_bits; diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 737961f275c..be233bc4d91 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -16,40 +16,29 @@ #define DM_MSG_PREFIX "dirty region log" -struct dm_dirty_log_internal { - struct dm_dirty_log_type *type; - - struct list_head list; - long use; -}; - static LIST_HEAD(_log_types); static DEFINE_SPINLOCK(_lock); -static struct dm_dirty_log_internal *__find_dirty_log_type(const char *name) +static struct dm_dirty_log_type *__find_dirty_log_type(const char *name) { - struct dm_dirty_log_internal *log_type; + struct dm_dirty_log_type *log_type; list_for_each_entry(log_type, &_log_types, list) - if (!strcmp(name, log_type->type->name)) + if (!strcmp(name, log_type->name)) return log_type; return NULL; } -static struct dm_dirty_log_internal *_get_dirty_log_type(const char *name) +static struct dm_dirty_log_type *_get_dirty_log_type(const char *name) { - struct dm_dirty_log_internal *log_type; + struct dm_dirty_log_type *log_type; spin_lock(&_lock); log_type = __find_dirty_log_type(name); - if (log_type) { - if (!log_type->use && !try_module_get(log_type->type->module)) - log_type = NULL; - else - log_type->use++; - } + if (log_type && !try_module_get(log_type->module)) + log_type = NULL; spin_unlock(&_lock); @@ -76,14 +65,14 @@ static struct dm_dirty_log_internal *_get_dirty_log_type(const char *name) static struct dm_dirty_log_type *get_type(const char *type_name) { char *p, *type_name_dup; - struct dm_dirty_log_internal *log_type; + struct dm_dirty_log_type *log_type; if (!type_name) return NULL; log_type = _get_dirty_log_type(type_name); if (log_type) - return log_type->type; + return log_type; type_name_dup = kstrdup(type_name, GFP_KERNEL); if (!type_name_dup) { @@ -105,56 +94,33 @@ static struct dm_dirty_log_type *get_type(const char *type_name) kfree(type_name_dup); - return log_type ? log_type->type : NULL; + return log_type; } static void put_type(struct dm_dirty_log_type *type) { - struct dm_dirty_log_internal *log_type; - if (!type) return; spin_lock(&_lock); - log_type = __find_dirty_log_type(type->name); - if (!log_type) + if (!__find_dirty_log_type(type->name)) goto out; - if (!--log_type->use) - module_put(type->module); - - BUG_ON(log_type->use < 0); + module_put(type->module); out: spin_unlock(&_lock); } -static struct dm_dirty_log_internal *_alloc_dirty_log_type(struct dm_dirty_log_type *type) -{ - struct dm_dirty_log_internal *log_type = kzalloc(sizeof(*log_type), - GFP_KERNEL); - - if (log_type) - log_type->type = type; - - return log_type; -} - int dm_dirty_log_type_register(struct dm_dirty_log_type *type) { - struct dm_dirty_log_internal *log_type = _alloc_dirty_log_type(type); int r = 0; - if (!log_type) - return -ENOMEM; - spin_lock(&_lock); if (!__find_dirty_log_type(type->name)) - list_add(&log_type->list, &_log_types); - else { - kfree(log_type); + list_add(&type->list, &_log_types); + else r = -EEXIST; - } spin_unlock(&_lock); return r; @@ -163,25 +129,16 @@ EXPORT_SYMBOL(dm_dirty_log_type_register); int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type) { - struct dm_dirty_log_internal *log_type; - spin_lock(&_lock); - log_type = __find_dirty_log_type(type->name); - if (!log_type) { + if (!__find_dirty_log_type(type->name)) { spin_unlock(&_lock); return -EINVAL; } - if (log_type->use) { - spin_unlock(&_lock); - return -ETXTBSY; - } - - list_del(&log_type->list); + list_del(&type->list); spin_unlock(&_lock); - kfree(log_type); return 0; } diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c index 96ea226155b..42c04f04a0c 100644 --- a/drivers/md/dm-path-selector.c +++ b/drivers/md/dm-path-selector.c @@ -17,9 +17,7 @@ struct ps_internal { struct path_selector_type pst; - struct list_head list; - long use; }; #define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst) @@ -45,12 +43,8 @@ static struct ps_internal *get_path_selector(const char *name) down_read(&_ps_lock); psi = __find_path_selector_type(name); - if (psi) { - if ((psi->use == 0) && !try_module_get(psi->pst.module)) - psi = NULL; - else - psi->use++; - } + if (psi && !try_module_get(psi->pst.module)) + psi = NULL; up_read(&_ps_lock); return psi; @@ -84,11 +78,7 @@ void dm_put_path_selector(struct path_selector_type *pst) if (!psi) goto out; - if (--psi->use == 0) - module_put(psi->pst.module); - - BUG_ON(psi->use < 0); - + module_put(psi->pst.module); out: up_read(&_ps_lock); } @@ -136,11 +126,6 @@ int dm_unregister_path_selector(struct path_selector_type *pst) return -EINVAL; } - if (psi->use) { - up_write(&_ps_lock); - return -ETXTBSY; - } - list_del(&psi->list); up_write(&_ps_lock); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 4d6bc101962..536ef0bef15 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -145,6 +145,8 @@ struct dm_raid1_read_record { struct dm_bio_details details; }; +static struct kmem_cache *_dm_raid1_read_record_cache; + /* * Every mirror should look like this one. */ @@ -586,6 +588,9 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) int state; struct bio *bio; struct bio_list sync, nosync, recover, *this_list = NULL; + struct bio_list requeue; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + region_t region; if (!writes->head) return; @@ -596,10 +601,18 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) bio_list_init(&sync); bio_list_init(&nosync); bio_list_init(&recover); + bio_list_init(&requeue); while ((bio = bio_list_pop(writes))) { - state = dm_rh_get_state(ms->rh, - dm_rh_bio_to_region(ms->rh, bio), 1); + region = dm_rh_bio_to_region(ms->rh, bio); + + if (log->type->is_remote_recovering && + log->type->is_remote_recovering(log, region)) { + bio_list_add(&requeue, bio); + continue; + } + + state = dm_rh_get_state(ms->rh, region, 1); switch (state) { case DM_RH_CLEAN: case DM_RH_DIRTY: @@ -619,6 +632,16 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) } /* + * Add bios that are delayed due to remote recovery + * back on to the write queue + */ + if (unlikely(requeue.head)) { + spin_lock_irq(&ms->lock); + bio_list_merge(&ms->writes, &requeue); + spin_unlock_irq(&ms->lock); + } + + /* * Increment the pending counts for any regions that will * be written to (writes to recover regions are going to * be delayed). @@ -764,9 +787,9 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, atomic_set(&ms->suspend, 0); atomic_set(&ms->default_mirror, DEFAULT_MIRROR); - len = sizeof(struct dm_raid1_read_record); - ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS, - len); + ms->read_record_pool = mempool_create_slab_pool(MIN_READ_RECORDS, + _dm_raid1_read_record_cache); + if (!ms->read_record_pool) { ti->error = "Error creating mirror read_record_pool"; kfree(ms); @@ -1279,16 +1302,31 @@ static int __init dm_mirror_init(void) { int r; + _dm_raid1_read_record_cache = KMEM_CACHE(dm_raid1_read_record, 0); + if (!_dm_raid1_read_record_cache) { + DMERR("Can't allocate dm_raid1_read_record cache"); + r = -ENOMEM; + goto bad_cache; + } + r = dm_register_target(&mirror_target); - if (r < 0) + if (r < 0) { DMERR("Failed to register mirror target"); + goto bad_target; + } + + return 0; +bad_target: + kmem_cache_destroy(_dm_raid1_read_record_cache); +bad_cache: return r; } static void __exit dm_mirror_exit(void) { dm_unregister_target(&mirror_target); + kmem_cache_destroy(_dm_raid1_read_record_cache); } /* Module hooks */ diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 936b34e0959..e75c6dd76a9 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -6,7 +6,6 @@ */ #include "dm-exception-store.h" -#include "dm-snap.h" #include <linux/mm.h> #include <linux/pagemap.h> @@ -89,7 +88,7 @@ struct commit_callback { * The top level structure for a persistent exception store. */ struct pstore { - struct dm_snapshot *snap; /* up pointer to my snapshot */ + struct dm_exception_store *store; int version; int valid; uint32_t exceptions_per_area; @@ -141,7 +140,7 @@ static int alloc_area(struct pstore *ps) int r = -ENOMEM; size_t len; - len = ps->snap->chunk_size << SECTOR_SHIFT; + len = ps->store->chunk_size << SECTOR_SHIFT; /* * Allocate the chunk_size block of memory that will hold @@ -163,9 +162,12 @@ static int alloc_area(struct pstore *ps) static void free_area(struct pstore *ps) { - vfree(ps->area); + if (ps->area) + vfree(ps->area); ps->area = NULL; - vfree(ps->zero_area); + + if (ps->zero_area) + vfree(ps->zero_area); ps->zero_area = NULL; } @@ -189,9 +191,9 @@ static void do_metadata(struct work_struct *work) static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata) { struct dm_io_region where = { - .bdev = ps->snap->cow->bdev, - .sector = ps->snap->chunk_size * chunk, - .count = ps->snap->chunk_size, + .bdev = ps->store->cow->bdev, + .sector = ps->store->chunk_size * chunk, + .count = ps->store->chunk_size, }; struct dm_io_request io_req = { .bi_rw = rw, @@ -247,15 +249,15 @@ static int area_io(struct pstore *ps, int rw) static void zero_memory_area(struct pstore *ps) { - memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT); + memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT); } static int zero_disk_area(struct pstore *ps, chunk_t area) { struct dm_io_region where = { - .bdev = ps->snap->cow->bdev, - .sector = ps->snap->chunk_size * area_location(ps, area), - .count = ps->snap->chunk_size, + .bdev = ps->store->cow->bdev, + .sector = ps->store->chunk_size * area_location(ps, area), + .count = ps->store->chunk_size, }; struct dm_io_request io_req = { .bi_rw = WRITE, @@ -278,15 +280,15 @@ static int read_header(struct pstore *ps, int *new_snapshot) /* * Use default chunk size (or hardsect_size, if larger) if none supplied */ - if (!ps->snap->chunk_size) { - ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, - bdev_hardsect_size(ps->snap->cow->bdev) >> 9); - ps->snap->chunk_mask = ps->snap->chunk_size - 1; - ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1; + if (!ps->store->chunk_size) { + ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, + bdev_hardsect_size(ps->store->cow->bdev) >> 9); + ps->store->chunk_mask = ps->store->chunk_size - 1; + ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; chunk_size_supplied = 0; } - ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap-> + ps->io_client = dm_io_client_create(sectors_to_pages(ps->store-> chunk_size)); if (IS_ERR(ps->io_client)) return PTR_ERR(ps->io_client); @@ -317,22 +319,22 @@ static int read_header(struct pstore *ps, int *new_snapshot) ps->version = le32_to_cpu(dh->version); chunk_size = le32_to_cpu(dh->chunk_size); - if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size) + if (!chunk_size_supplied || ps->store->chunk_size == chunk_size) return 0; DMWARN("chunk size %llu in device metadata overrides " "table chunk size of %llu.", (unsigned long long)chunk_size, - (unsigned long long)ps->snap->chunk_size); + (unsigned long long)ps->store->chunk_size); /* We had a bogus chunk_size. Fix stuff up. */ free_area(ps); - ps->snap->chunk_size = chunk_size; - ps->snap->chunk_mask = chunk_size - 1; - ps->snap->chunk_shift = ffs(chunk_size) - 1; + ps->store->chunk_size = chunk_size; + ps->store->chunk_mask = chunk_size - 1; + ps->store->chunk_shift = ffs(chunk_size) - 1; - r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size), + r = dm_io_client_resize(sectors_to_pages(ps->store->chunk_size), ps->io_client); if (r) return r; @@ -349,13 +351,13 @@ static int write_header(struct pstore *ps) { struct disk_header *dh; - memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT); + memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT); dh = (struct disk_header *) ps->area; dh->magic = cpu_to_le32(SNAP_MAGIC); dh->valid = cpu_to_le32(ps->valid); dh->version = cpu_to_le32(ps->version); - dh->chunk_size = cpu_to_le32(ps->snap->chunk_size); + dh->chunk_size = cpu_to_le32(ps->store->chunk_size); return chunk_io(ps, 0, WRITE, 1); } @@ -474,18 +476,25 @@ static struct pstore *get_info(struct dm_exception_store *store) static void persistent_fraction_full(struct dm_exception_store *store, sector_t *numerator, sector_t *denominator) { - *numerator = get_info(store)->next_free * store->snap->chunk_size; - *denominator = get_dev_size(store->snap->cow->bdev); + *numerator = get_info(store)->next_free * store->chunk_size; + *denominator = get_dev_size(store->cow->bdev); } -static void persistent_destroy(struct dm_exception_store *store) +static void persistent_dtr(struct dm_exception_store *store) { struct pstore *ps = get_info(store); destroy_workqueue(ps->metadata_wq); - dm_io_client_destroy(ps->io_client); - vfree(ps->callbacks); + + /* Created in read_header */ + if (ps->io_client) + dm_io_client_destroy(ps->io_client); free_area(ps); + + /* Allocated in persistent_read_metadata */ + if (ps->callbacks) + vfree(ps->callbacks); + kfree(ps); } @@ -507,7 +516,7 @@ static int persistent_read_metadata(struct dm_exception_store *store, /* * Now we know correct chunk_size, complete the initialisation. */ - ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) / + ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) / sizeof(struct disk_exception); ps->callbacks = dm_vcalloc(ps->exceptions_per_area, sizeof(*ps->callbacks)); @@ -564,10 +573,10 @@ static int persistent_prepare_exception(struct dm_exception_store *store, struct pstore *ps = get_info(store); uint32_t stride; chunk_t next_free; - sector_t size = get_dev_size(store->snap->cow->bdev); + sector_t size = get_dev_size(store->cow->bdev); /* Is there enough room ? */ - if (size < ((ps->next_free + 1) * store->snap->chunk_size)) + if (size < ((ps->next_free + 1) * store->chunk_size)) return -ENOSPC; e->new_chunk = ps->next_free; @@ -656,16 +665,17 @@ static void persistent_drop_snapshot(struct dm_exception_store *store) DMWARN("write header failed"); } -int dm_create_persistent(struct dm_exception_store *store) +static int persistent_ctr(struct dm_exception_store *store, + unsigned argc, char **argv) { struct pstore *ps; /* allocate the pstore */ - ps = kmalloc(sizeof(*ps), GFP_KERNEL); + ps = kzalloc(sizeof(*ps), GFP_KERNEL); if (!ps) return -ENOMEM; - ps->snap = store->snap; + ps->store = store; ps->valid = 1; ps->version = SNAPSHOT_DISK_VERSION; ps->area = NULL; @@ -683,22 +693,77 @@ int dm_create_persistent(struct dm_exception_store *store) return -ENOMEM; } - store->destroy = persistent_destroy; - store->read_metadata = persistent_read_metadata; - store->prepare_exception = persistent_prepare_exception; - store->commit_exception = persistent_commit_exception; - store->drop_snapshot = persistent_drop_snapshot; - store->fraction_full = persistent_fraction_full; store->context = ps; return 0; } +static unsigned persistent_status(struct dm_exception_store *store, + status_type_t status, char *result, + unsigned maxlen) +{ + unsigned sz = 0; + + switch (status) { + case STATUSTYPE_INFO: + break; + case STATUSTYPE_TABLE: + DMEMIT(" %s P %llu", store->cow->name, + (unsigned long long)store->chunk_size); + } + + return sz; +} + +static struct dm_exception_store_type _persistent_type = { + .name = "persistent", + .module = THIS_MODULE, + .ctr = persistent_ctr, + .dtr = persistent_dtr, + .read_metadata = persistent_read_metadata, + .prepare_exception = persistent_prepare_exception, + .commit_exception = persistent_commit_exception, + .drop_snapshot = persistent_drop_snapshot, + .fraction_full = persistent_fraction_full, + .status = persistent_status, +}; + +static struct dm_exception_store_type _persistent_compat_type = { + .name = "P", + .module = THIS_MODULE, + .ctr = persistent_ctr, + .dtr = persistent_dtr, + .read_metadata = persistent_read_metadata, + .prepare_exception = persistent_prepare_exception, + .commit_exception = persistent_commit_exception, + .drop_snapshot = persistent_drop_snapshot, + .fraction_full = persistent_fraction_full, + .status = persistent_status, +}; + int dm_persistent_snapshot_init(void) { - return 0; + int r; + + r = dm_exception_store_type_register(&_persistent_type); + if (r) { + DMERR("Unable to register persistent exception store type"); + return r; + } + + r = dm_exception_store_type_register(&_persistent_compat_type); + if (r) { + DMERR("Unable to register old-style persistent exception " + "store type"); + dm_exception_store_type_unregister(&_persistent_type); + return r; + } + + return r; } void dm_persistent_snapshot_exit(void) { + dm_exception_store_type_unregister(&_persistent_type); + dm_exception_store_type_unregister(&_persistent_compat_type); } diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c index 7f6e2e6dcb0..cde5aa558e6 100644 --- a/drivers/md/dm-snap-transient.c +++ b/drivers/md/dm-snap-transient.c @@ -6,7 +6,6 @@ */ #include "dm-exception-store.h" -#include "dm-snap.h" #include <linux/mm.h> #include <linux/pagemap.h> @@ -23,7 +22,7 @@ struct transient_c { sector_t next_free; }; -static void transient_destroy(struct dm_exception_store *store) +static void transient_dtr(struct dm_exception_store *store) { kfree(store->context); } @@ -39,14 +38,14 @@ static int transient_read_metadata(struct dm_exception_store *store, static int transient_prepare_exception(struct dm_exception_store *store, struct dm_snap_exception *e) { - struct transient_c *tc = (struct transient_c *) store->context; - sector_t size = get_dev_size(store->snap->cow->bdev); + struct transient_c *tc = store->context; + sector_t size = get_dev_size(store->cow->bdev); - if (size < (tc->next_free + store->snap->chunk_size)) + if (size < (tc->next_free + store->chunk_size)) return -1; - e->new_chunk = sector_to_chunk(store->snap, tc->next_free); - tc->next_free += store->snap->chunk_size; + e->new_chunk = sector_to_chunk(store, tc->next_free); + tc->next_free += store->chunk_size; return 0; } @@ -64,20 +63,14 @@ static void transient_fraction_full(struct dm_exception_store *store, sector_t *numerator, sector_t *denominator) { *numerator = ((struct transient_c *) store->context)->next_free; - *denominator = get_dev_size(store->snap->cow->bdev); + *denominator = get_dev_size(store->cow->bdev); } -int dm_create_transient(struct dm_exception_store *store) +static int transient_ctr(struct dm_exception_store *store, + unsigned argc, char **argv) { struct transient_c *tc; - store->destroy = transient_destroy; - store->read_metadata = transient_read_metadata; - store->prepare_exception = transient_prepare_exception; - store->commit_exception = transient_commit_exception; - store->drop_snapshot = NULL; - store->fraction_full = transient_fraction_full; - tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); if (!tc) return -ENOMEM; @@ -88,11 +81,70 @@ int dm_create_transient(struct dm_exception_store *store) return 0; } +static unsigned transient_status(struct dm_exception_store *store, + status_type_t status, char *result, + unsigned maxlen) +{ + unsigned sz = 0; + + switch (status) { + case STATUSTYPE_INFO: + break; + case STATUSTYPE_TABLE: + DMEMIT(" %s N %llu", store->cow->name, + (unsigned long long)store->chunk_size); + } + + return sz; +} + +static struct dm_exception_store_type _transient_type = { + .name = "transient", + .module = THIS_MODULE, + .ctr = transient_ctr, + .dtr = transient_dtr, + .read_metadata = transient_read_metadata, + .prepare_exception = transient_prepare_exception, + .commit_exception = transient_commit_exception, + .fraction_full = transient_fraction_full, + .status = transient_status, +}; + +static struct dm_exception_store_type _transient_compat_type = { + .name = "N", + .module = THIS_MODULE, + .ctr = transient_ctr, + .dtr = transient_dtr, + .read_metadata = transient_read_metadata, + .prepare_exception = transient_prepare_exception, + .commit_exception = transient_commit_exception, + .fraction_full = transient_fraction_full, + .status = transient_status, +}; + int dm_transient_snapshot_init(void) { - return 0; + int r; + + r = dm_exception_store_type_register(&_transient_type); + if (r) { + DMWARN("Unable to register transient exception store type"); + return r; + } + + r = dm_exception_store_type_register(&_transient_compat_type); + if (r) { + DMWARN("Unable to register old-style transient " + "exception store type"); + dm_exception_store_type_unregister(&_transient_type); + return r; + } + + return r; } void dm_transient_snapshot_exit(void) { + dm_exception_store_type_unregister(&_transient_type); + dm_exception_store_type_unregister(&_transient_compat_type); } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 65ff82ff124..981a0413068 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -7,7 +7,6 @@ */ #include <linux/blkdev.h> -#include <linux/ctype.h> #include <linux/device-mapper.h> #include <linux/delay.h> #include <linux/fs.h> @@ -20,9 +19,9 @@ #include <linux/vmalloc.h> #include <linux/log2.h> #include <linux/dm-kcopyd.h> +#include <linux/workqueue.h> #include "dm-exception-store.h" -#include "dm-snap.h" #include "dm-bio-list.h" #define DM_MSG_PREFIX "snapshots" @@ -47,9 +46,76 @@ */ #define MIN_IOS 256 +#define DM_TRACKED_CHUNK_HASH_SIZE 16 +#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ + (DM_TRACKED_CHUNK_HASH_SIZE - 1)) + +struct exception_table { + uint32_t hash_mask; + unsigned hash_shift; + struct list_head *table; +}; + +struct dm_snapshot { + struct rw_semaphore lock; + + struct dm_dev *origin; + + /* List of snapshots per Origin */ + struct list_head list; + + /* You can't use a snapshot if this is 0 (e.g. if full) */ + int valid; + + /* Origin writes don't trigger exceptions until this is set */ + int active; + + mempool_t *pending_pool; + + atomic_t pending_exceptions_count; + + struct exception_table pending; + struct exception_table complete; + + /* + * pe_lock protects all pending_exception operations and access + * as well as the snapshot_bios list. + */ + spinlock_t pe_lock; + + /* The on disk metadata handler */ + struct dm_exception_store *store; + + struct dm_kcopyd_client *kcopyd_client; + + /* Queue of snapshot writes for ksnapd to flush */ + struct bio_list queued_bios; + struct work_struct queued_bios_work; + + /* Chunks with outstanding reads */ + mempool_t *tracked_chunk_pool; + spinlock_t tracked_chunk_lock; + struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; +}; + static struct workqueue_struct *ksnapd; static void flush_queued_bios(struct work_struct *work); +static sector_t chunk_to_sector(struct dm_exception_store *store, + chunk_t chunk) +{ + return chunk << store->chunk_shift; +} + +static int bdev_equal(struct block_device *lhs, struct block_device *rhs) +{ + /* + * There is only ever one instance of a particular block + * device so we can compare pointers safely. + */ + return lhs == rhs; +} + struct dm_snap_pending_exception { struct dm_snap_exception e; @@ -476,11 +542,11 @@ static int init_hash_tables(struct dm_snapshot *s) * Calculate based on the size of the original volume or * the COW volume... */ - cow_dev_size = get_dev_size(s->cow->bdev); + cow_dev_size = get_dev_size(s->store->cow->bdev); origin_dev_size = get_dev_size(s->origin->bdev); max_buckets = calc_max_buckets(); - hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift; + hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift; hash_size = min(hash_size, max_buckets); hash_size = rounddown_pow_of_two(hash_size); @@ -505,58 +571,6 @@ static int init_hash_tables(struct dm_snapshot *s) } /* - * Round a number up to the nearest 'size' boundary. size must - * be a power of 2. - */ -static ulong round_up(ulong n, ulong size) -{ - size--; - return (n + size) & ~size; -} - -static int set_chunk_size(struct dm_snapshot *s, const char *chunk_size_arg, - char **error) -{ - unsigned long chunk_size; - char *value; - - chunk_size = simple_strtoul(chunk_size_arg, &value, 10); - if (*chunk_size_arg == '\0' || *value != '\0') { - *error = "Invalid chunk size"; - return -EINVAL; - } - - if (!chunk_size) { - s->chunk_size = s->chunk_mask = s->chunk_shift = 0; - return 0; - } - - /* - * Chunk size must be multiple of page size. Silently - * round up if it's not. - */ - chunk_size = round_up(chunk_size, PAGE_SIZE >> 9); - - /* Check chunk_size is a power of 2 */ - if (!is_power_of_2(chunk_size)) { - *error = "Chunk size is not a power of 2"; - return -EINVAL; - } - - /* Validate the chunk size against the device block size */ - if (chunk_size % (bdev_hardsect_size(s->cow->bdev) >> 9)) { - *error = "Chunk size is not a multiple of device blocksize"; - return -EINVAL; - } - - s->chunk_size = chunk_size; - s->chunk_mask = chunk_size - 1; - s->chunk_shift = ffs(chunk_size) - 1; - - return 0; -} - -/* * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> */ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) @@ -564,91 +578,68 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) struct dm_snapshot *s; int i; int r = -EINVAL; - char persistent; char *origin_path; - char *cow_path; + struct dm_exception_store *store; + unsigned args_used; if (argc != 4) { ti->error = "requires exactly 4 arguments"; r = -EINVAL; - goto bad1; + goto bad_args; } origin_path = argv[0]; - cow_path = argv[1]; - persistent = toupper(*argv[2]); + argv++; + argc--; - if (persistent != 'P' && persistent != 'N') { - ti->error = "Persistent flag is not P or N"; + r = dm_exception_store_create(ti, argc, argv, &args_used, &store); + if (r) { + ti->error = "Couldn't create exception store"; r = -EINVAL; - goto bad1; + goto bad_args; } + argv += args_used; + argc -= args_used; + s = kmalloc(sizeof(*s), GFP_KERNEL); - if (s == NULL) { + if (!s) { ti->error = "Cannot allocate snapshot context private " "structure"; r = -ENOMEM; - goto bad1; + goto bad_snap; } r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin); if (r) { ti->error = "Cannot get origin device"; - goto bad2; - } - - r = dm_get_device(ti, cow_path, 0, 0, - FMODE_READ | FMODE_WRITE, &s->cow); - if (r) { - dm_put_device(ti, s->origin); - ti->error = "Cannot get COW device"; - goto bad2; + goto bad_origin; } - r = set_chunk_size(s, argv[3], &ti->error); - if (r) - goto bad3; - - s->type = persistent; - + s->store = store; s->valid = 1; s->active = 0; atomic_set(&s->pending_exceptions_count, 0); init_rwsem(&s->lock); spin_lock_init(&s->pe_lock); - s->ti = ti; /* Allocate hash table for COW data */ if (init_hash_tables(s)) { ti->error = "Unable to allocate hash table space"; r = -ENOMEM; - goto bad3; - } - - s->store.snap = s; - - if (persistent == 'P') - r = dm_create_persistent(&s->store); - else - r = dm_create_transient(&s->store); - - if (r) { - ti->error = "Couldn't create exception store"; - r = -EINVAL; - goto bad4; + goto bad_hash_tables; } r = dm_kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client); if (r) { ti->error = "Could not create kcopyd client"; - goto bad5; + goto bad_kcopyd; } s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache); if (!s->pending_pool) { ti->error = "Could not allocate mempool for pending exceptions"; - goto bad6; + goto bad_pending_pool; } s->tracked_chunk_pool = mempool_create_slab_pool(MIN_IOS, @@ -665,7 +656,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) spin_lock_init(&s->tracked_chunk_lock); /* Metadata must only be loaded into one table at once */ - r = s->store.read_metadata(&s->store, dm_add_exception, (void *)s); + r = s->store->type->read_metadata(s->store, dm_add_exception, + (void *)s); if (r < 0) { ti->error = "Failed to read snapshot metadata"; goto bad_load_and_register; @@ -686,34 +678,33 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ti->private = s; - ti->split_io = s->chunk_size; + ti->split_io = s->store->chunk_size; return 0; - bad_load_and_register: +bad_load_and_register: mempool_destroy(s->tracked_chunk_pool); - bad_tracked_chunk_pool: +bad_tracked_chunk_pool: mempool_destroy(s->pending_pool); - bad6: +bad_pending_pool: dm_kcopyd_client_destroy(s->kcopyd_client); - bad5: - s->store.destroy(&s->store); - - bad4: +bad_kcopyd: exit_exception_table(&s->pending, pending_cache); exit_exception_table(&s->complete, exception_cache); - bad3: - dm_put_device(ti, s->cow); +bad_hash_tables: dm_put_device(ti, s->origin); - bad2: +bad_origin: kfree(s); - bad1: +bad_snap: + dm_exception_store_destroy(store); + +bad_args: return r; } @@ -724,8 +715,6 @@ static void __free_exceptions(struct dm_snapshot *s) exit_exception_table(&s->pending, pending_cache); exit_exception_table(&s->complete, exception_cache); - - s->store.destroy(&s->store); } static void snapshot_dtr(struct dm_target *ti) @@ -761,7 +750,8 @@ static void snapshot_dtr(struct dm_target *ti) mempool_destroy(s->pending_pool); dm_put_device(ti, s->origin); - dm_put_device(ti, s->cow); + + dm_exception_store_destroy(s->store); kfree(s); } @@ -820,12 +810,12 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err) else if (err == -ENOMEM) DMERR("Invalidating snapshot: Unable to allocate exception."); - if (s->store.drop_snapshot) - s->store.drop_snapshot(&s->store); + if (s->store->type->drop_snapshot) + s->store->type->drop_snapshot(s->store); s->valid = 0; - dm_table_event(s->ti->table); + dm_table_event(s->store->ti->table); } static void get_pending_exception(struct dm_snap_pending_exception *pe) @@ -943,8 +933,8 @@ static void copy_callback(int read_err, unsigned long write_err, void *context) else /* Update the metadata if we are persistent */ - s->store.commit_exception(&s->store, &pe->e, commit_callback, - pe); + s->store->type->commit_exception(s->store, &pe->e, + commit_callback, pe); } /* @@ -960,11 +950,11 @@ static void start_copy(struct dm_snap_pending_exception *pe) dev_size = get_dev_size(bdev); src.bdev = bdev; - src.sector = chunk_to_sector(s, pe->e.old_chunk); - src.count = min(s->chunk_size, dev_size - src.sector); + src.sector = chunk_to_sector(s->store, pe->e.old_chunk); + src.count = min(s->store->chunk_size, dev_size - src.sector); - dest.bdev = s->cow->bdev; - dest.sector = chunk_to_sector(s, pe->e.new_chunk); + dest.bdev = s->store->cow->bdev; + dest.sector = chunk_to_sector(s->store, pe->e.new_chunk); dest.count = src.count; /* Hand over to kcopyd */ @@ -972,6 +962,17 @@ static void start_copy(struct dm_snap_pending_exception *pe) &src, 1, &dest, 0, copy_callback, pe); } +static struct dm_snap_pending_exception * +__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) +{ + struct dm_snap_exception *e = lookup_exception(&s->pending, chunk); + + if (!e) + return NULL; + + return container_of(e, struct dm_snap_pending_exception, e); +} + /* * Looks to see if this snapshot already has a pending exception * for this chunk, otherwise it allocates a new one and inserts @@ -981,40 +982,15 @@ static void start_copy(struct dm_snap_pending_exception *pe) * this. */ static struct dm_snap_pending_exception * -__find_pending_exception(struct dm_snapshot *s, struct bio *bio) +__find_pending_exception(struct dm_snapshot *s, + struct dm_snap_pending_exception *pe, chunk_t chunk) { - struct dm_snap_exception *e; - struct dm_snap_pending_exception *pe; - chunk_t chunk = sector_to_chunk(s, bio->bi_sector); + struct dm_snap_pending_exception *pe2; - /* - * Is there a pending exception for this already ? - */ - e = lookup_exception(&s->pending, chunk); - if (e) { - /* cast the exception to a pending exception */ - pe = container_of(e, struct dm_snap_pending_exception, e); - goto out; - } - - /* - * Create a new pending exception, we don't want - * to hold the lock while we do this. - */ - up_write(&s->lock); - pe = alloc_pending_exception(s); - down_write(&s->lock); - - if (!s->valid) { - free_pending_exception(pe); - return NULL; - } - - e = lookup_exception(&s->pending, chunk); - if (e) { + pe2 = __lookup_pending_exception(s, chunk); + if (pe2) { free_pending_exception(pe); - pe = container_of(e, struct dm_snap_pending_exception, e); - goto out; + return pe2; } pe->e.old_chunk = chunk; @@ -1024,7 +1000,7 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio) atomic_set(&pe->ref_count, 0); pe->started = 0; - if (s->store.prepare_exception(&s->store, &pe->e)) { + if (s->store->type->prepare_exception(s->store, &pe->e)) { free_pending_exception(pe); return NULL; } @@ -1032,17 +1008,18 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio) get_pending_exception(pe); insert_exception(&s->pending, &pe->e); - out: return pe; } static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, struct bio *bio, chunk_t chunk) { - bio->bi_bdev = s->cow->bdev; - bio->bi_sector = chunk_to_sector(s, dm_chunk_number(e->new_chunk) + - (chunk - e->old_chunk)) + - (bio->bi_sector & s->chunk_mask); + bio->bi_bdev = s->store->cow->bdev; + bio->bi_sector = chunk_to_sector(s->store, + dm_chunk_number(e->new_chunk) + + (chunk - e->old_chunk)) + + (bio->bi_sector & + s->store->chunk_mask); } static int snapshot_map(struct dm_target *ti, struct bio *bio, @@ -1054,7 +1031,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, chunk_t chunk; struct dm_snap_pending_exception *pe = NULL; - chunk = sector_to_chunk(s, bio->bi_sector); + chunk = sector_to_chunk(s->store, bio->bi_sector); /* Full snapshots are not usable */ /* To get here the table must be live so s->active is always set. */ @@ -1083,11 +1060,31 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, * writeable. */ if (bio_rw(bio) == WRITE) { - pe = __find_pending_exception(s, bio); + pe = __lookup_pending_exception(s, chunk); if (!pe) { - __invalidate_snapshot(s, -ENOMEM); - r = -EIO; - goto out_unlock; + up_write(&s->lock); + pe = alloc_pending_exception(s); + down_write(&s->lock); + + if (!s->valid) { + free_pending_exception(pe); + r = -EIO; + goto out_unlock; + } + + e = lookup_exception(&s->complete, chunk); + if (e) { + free_pending_exception(pe); + remap_exception(s, e, bio, chunk); + goto out_unlock; + } + + pe = __find_pending_exception(s, pe, chunk); + if (!pe) { + __invalidate_snapshot(s, -ENOMEM); + r = -EIO; + goto out_unlock; + } } remap_exception(s, &pe->e, bio, chunk); @@ -1137,24 +1134,25 @@ static void snapshot_resume(struct dm_target *ti) static int snapshot_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { + unsigned sz = 0; struct dm_snapshot *snap = ti->private; switch (type) { case STATUSTYPE_INFO: if (!snap->valid) - snprintf(result, maxlen, "Invalid"); + DMEMIT("Invalid"); else { - if (snap->store.fraction_full) { + if (snap->store->type->fraction_full) { sector_t numerator, denominator; - snap->store.fraction_full(&snap->store, - &numerator, - &denominator); - snprintf(result, maxlen, "%llu/%llu", - (unsigned long long)numerator, - (unsigned long long)denominator); + snap->store->type->fraction_full(snap->store, + &numerator, + &denominator); + DMEMIT("%llu/%llu", + (unsigned long long)numerator, + (unsigned long long)denominator); } else - snprintf(result, maxlen, "Unknown"); + DMEMIT("Unknown"); } break; @@ -1164,10 +1162,9 @@ static int snapshot_status(struct dm_target *ti, status_type_t type, * to make private copies if the output is to * make sense. */ - snprintf(result, maxlen, "%s %s %c %llu", - snap->origin->name, snap->cow->name, - snap->type, - (unsigned long long)snap->chunk_size); + DMEMIT("%s", snap->origin->name); + snap->store->type->status(snap->store, type, result + sz, + maxlen - sz); break; } @@ -1196,14 +1193,14 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) goto next_snapshot; /* Nothing to do if writing beyond end of snapshot */ - if (bio->bi_sector >= dm_table_get_size(snap->ti->table)) + if (bio->bi_sector >= dm_table_get_size(snap->store->ti->table)) goto next_snapshot; /* * Remember, different snapshots can have * different chunk sizes. */ - chunk = sector_to_chunk(snap, bio->bi_sector); + chunk = sector_to_chunk(snap->store, bio->bi_sector); /* * Check exception table to see if block @@ -1217,10 +1214,28 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) if (e) goto next_snapshot; - pe = __find_pending_exception(snap, bio); + pe = __lookup_pending_exception(snap, chunk); if (!pe) { - __invalidate_snapshot(snap, -ENOMEM); - goto next_snapshot; + up_write(&snap->lock); + pe = alloc_pending_exception(snap); + down_write(&snap->lock); + + if (!snap->valid) { + free_pending_exception(pe); + goto next_snapshot; + } + + e = lookup_exception(&snap->complete, chunk); + if (e) { + free_pending_exception(pe); + goto next_snapshot; + } + + pe = __find_pending_exception(snap, pe, chunk); + if (!pe) { + __invalidate_snapshot(snap, -ENOMEM); + goto next_snapshot; + } } if (!primary_pe) { @@ -1360,7 +1375,8 @@ static void origin_resume(struct dm_target *ti) o = __lookup_origin(dev->bdev); if (o) list_for_each_entry (snap, &o->snapshots, list) - chunk_size = min_not_zero(chunk_size, snap->chunk_size); + chunk_size = min_not_zero(chunk_size, + snap->store->chunk_size); up_read(&_origins_lock); ti->split_io = chunk_size; diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h deleted file mode 100644 index d9e62b43cf8..00000000000 --- a/drivers/md/dm-snap.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (C) 2001-2002 Sistina Software (UK) Limited. - * - * This file is released under the GPL. - */ - -#ifndef DM_SNAPSHOT_H -#define DM_SNAPSHOT_H - -#include <linux/device-mapper.h> -#include "dm-exception-store.h" -#include "dm-bio-list.h" -#include <linux/blkdev.h> -#include <linux/workqueue.h> - -struct exception_table { - uint32_t hash_mask; - unsigned hash_shift; - struct list_head *table; -}; - -#define DM_TRACKED_CHUNK_HASH_SIZE 16 -#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ - (DM_TRACKED_CHUNK_HASH_SIZE - 1)) - -struct dm_snapshot { - struct rw_semaphore lock; - struct dm_target *ti; - - struct dm_dev *origin; - struct dm_dev *cow; - - /* List of snapshots per Origin */ - struct list_head list; - - /* Size of data blocks saved - must be a power of 2 */ - chunk_t chunk_size; - chunk_t chunk_mask; - chunk_t chunk_shift; - - /* You can't use a snapshot if this is 0 (e.g. if full) */ - int valid; - - /* Origin writes don't trigger exceptions until this is set */ - int active; - - /* Used for display of table */ - char type; - - mempool_t *pending_pool; - - atomic_t pending_exceptions_count; - - struct exception_table pending; - struct exception_table complete; - - /* - * pe_lock protects all pending_exception operations and access - * as well as the snapshot_bios list. - */ - spinlock_t pe_lock; - - /* The on disk metadata handler */ - struct dm_exception_store store; - - struct dm_kcopyd_client *kcopyd_client; - - /* Queue of snapshot writes for ksnapd to flush */ - struct bio_list queued_bios; - struct work_struct queued_bios_work; - - /* Chunks with outstanding reads */ - mempool_t *tracked_chunk_pool; - spinlock_t tracked_chunk_lock; - struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; -}; - -/* - * Return the number of sectors in the device. - */ -static inline sector_t get_dev_size(struct block_device *bdev) -{ - return bdev->bd_inode->i_size >> SECTOR_SHIFT; -} - -static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector) -{ - return (sector & ~s->chunk_mask) >> s->chunk_shift; -} - -static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk) -{ - return chunk << s->chunk_shift; -} - -static inline int bdev_equal(struct block_device *lhs, struct block_device *rhs) -{ - /* - * There is only ever one instance of a particular block - * device so we can compare pointers safely. - */ - return lhs == rhs; -} - -#endif diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 2fd66c30f7f..e8361b191b9 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -399,28 +399,30 @@ static int check_device_area(struct dm_dev_internal *dd, sector_t start, } /* - * This upgrades the mode on an already open dm_dev. Being + * This upgrades the mode on an already open dm_dev, being * careful to leave things as they were if we fail to reopen the - * device. + * device and not to touch the existing bdev field in case + * it is accessed concurrently inside dm_table_any_congested(). */ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, struct mapped_device *md) { int r; - struct dm_dev_internal dd_copy; - dev_t dev = dd->dm_dev.bdev->bd_dev; + struct dm_dev_internal dd_new, dd_old; - dd_copy = *dd; + dd_new = dd_old = *dd; + + dd_new.dm_dev.mode |= new_mode; + dd_new.dm_dev.bdev = NULL; + + r = open_dev(&dd_new, dd->dm_dev.bdev->bd_dev, md); + if (r) + return r; dd->dm_dev.mode |= new_mode; - dd->dm_dev.bdev = NULL; - r = open_dev(dd, dev, md); - if (!r) - close_dev(&dd_copy, md); - else - *dd = dd_copy; + close_dev(&dd_old, md); - return r; + return 0; } /* diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 7decf10006e..04feccf2a99 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -14,45 +14,34 @@ #define DM_MSG_PREFIX "target" -struct tt_internal { - struct target_type tt; - - struct list_head list; - long use; -}; - static LIST_HEAD(_targets); static DECLARE_RWSEM(_lock); #define DM_MOD_NAME_SIZE 32 -static inline struct tt_internal *__find_target_type(const char *name) +static inline struct target_type *__find_target_type(const char *name) { - struct tt_internal *ti; + struct target_type *tt; - list_for_each_entry (ti, &_targets, list) - if (!strcmp(name, ti->tt.name)) - return ti; + list_for_each_entry(tt, &_targets, list) + if (!strcmp(name, tt->name)) + return tt; return NULL; } -static struct tt_internal *get_target_type(const char *name) +static struct target_type *get_target_type(const char *name) { - struct tt_internal *ti; + struct target_type *tt; down_read(&_lock); - ti = __find_target_type(name); - if (ti) { - if ((ti->use == 0) && !try_module_get(ti->tt.module)) - ti = NULL; - else - ti->use++; - } + tt = __find_target_type(name); + if (tt && !try_module_get(tt->module)) + tt = NULL; up_read(&_lock); - return ti; + return tt; } static void load_module(const char *name) @@ -62,92 +51,59 @@ static void load_module(const char *name) struct target_type *dm_get_target_type(const char *name) { - struct tt_internal *ti = get_target_type(name); + struct target_type *tt = get_target_type(name); - if (!ti) { + if (!tt) { load_module(name); - ti = get_target_type(name); + tt = get_target_type(name); } - return ti ? &ti->tt : NULL; + return tt; } -void dm_put_target_type(struct target_type *t) +void dm_put_target_type(struct target_type *tt) { - struct tt_internal *ti = (struct tt_internal *) t; - down_read(&_lock); - if (--ti->use == 0) - module_put(ti->tt.module); - - BUG_ON(ti->use < 0); + module_put(tt->module); up_read(&_lock); - - return; -} - -static struct tt_internal *alloc_target(struct target_type *t) -{ - struct tt_internal *ti = kzalloc(sizeof(*ti), GFP_KERNEL); - - if (ti) - ti->tt = *t; - - return ti; } - int dm_target_iterate(void (*iter_func)(struct target_type *tt, void *param), void *param) { - struct tt_internal *ti; + struct target_type *tt; down_read(&_lock); - list_for_each_entry (ti, &_targets, list) - iter_func(&ti->tt, param); + list_for_each_entry(tt, &_targets, list) + iter_func(tt, param); up_read(&_lock); return 0; } -int dm_register_target(struct target_type *t) +int dm_register_target(struct target_type *tt) { int rv = 0; - struct tt_internal *ti = alloc_target(t); - - if (!ti) - return -ENOMEM; down_write(&_lock); - if (__find_target_type(t->name)) + if (__find_target_type(tt->name)) rv = -EEXIST; else - list_add(&ti->list, &_targets); + list_add(&tt->list, &_targets); up_write(&_lock); - if (rv) - kfree(ti); return rv; } -void dm_unregister_target(struct target_type *t) +void dm_unregister_target(struct target_type *tt) { - struct tt_internal *ti; - down_write(&_lock); - if (!(ti = __find_target_type(t->name))) { - DMCRIT("Unregistering unrecognised target: %s", t->name); - BUG(); - } - - if (ti->use) { - DMCRIT("Attempt to unregister target still in use: %s", - t->name); + if (!__find_target_type(tt->name)) { + DMCRIT("Unregistering unrecognised target: %s", tt->name); BUG(); } - list_del(&ti->list); - kfree(ti); + list_del(&tt->list); up_write(&_lock); } @@ -156,17 +112,17 @@ void dm_unregister_target(struct target_type *t) * io-err: always fails an io, useful for bringing * up LVs that have holes in them. */ -static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args) +static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args) { return 0; } -static void io_err_dtr(struct dm_target *ti) +static void io_err_dtr(struct dm_target *tt) { /* empty */ } -static int io_err_map(struct dm_target *ti, struct bio *bio, +static int io_err_map(struct dm_target *tt, struct bio *bio, union map_info *map_context) { return -EIO; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8d40f27cce8..788ba96a625 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -99,19 +99,9 @@ union map_info *dm_get_mapinfo(struct bio *bio) /* * Work processed by per-device workqueue. */ -struct dm_wq_req { - enum { - DM_WQ_FLUSH_DEFERRED, - } type; - struct work_struct work; - struct mapped_device *md; - void *context; -}; - struct mapped_device { struct rw_semaphore io_lock; struct mutex suspend_lock; - spinlock_t pushback_lock; rwlock_t map_lock; atomic_t holders; atomic_t open_count; @@ -129,8 +119,9 @@ struct mapped_device { */ atomic_t pending; wait_queue_head_t wait; + struct work_struct work; struct bio_list deferred; - struct bio_list pushback; + spinlock_t deferred_lock; /* * Processing queue (flush/barriers) @@ -453,7 +444,9 @@ static int queue_io(struct mapped_device *md, struct bio *bio) return 1; } + spin_lock_irq(&md->deferred_lock); bio_list_add(&md->deferred, bio); + spin_unlock_irq(&md->deferred_lock); up_write(&md->io_lock); return 0; /* deferred successfully */ @@ -537,16 +530,14 @@ static void dec_pending(struct dm_io *io, int error) if (io->error == DM_ENDIO_REQUEUE) { /* * Target requested pushing back the I/O. - * This must be handled before the sleeper on - * suspend queue merges the pushback list. */ - spin_lock_irqsave(&md->pushback_lock, flags); + spin_lock_irqsave(&md->deferred_lock, flags); if (__noflush_suspending(md)) - bio_list_add(&md->pushback, io->bio); + bio_list_add(&md->deferred, io->bio); else /* noflush suspend was interrupted. */ io->error = -EIO; - spin_unlock_irqrestore(&md->pushback_lock, flags); + spin_unlock_irqrestore(&md->deferred_lock, flags); } end_io_acct(io); @@ -834,20 +825,22 @@ static int __clone_and_map(struct clone_info *ci) } /* - * Split the bio into several clones. + * Split the bio into several clones and submit it to targets. */ -static int __split_bio(struct mapped_device *md, struct bio *bio) +static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) { struct clone_info ci; int error = 0; ci.map = dm_get_table(md); - if (unlikely(!ci.map)) - return -EIO; + if (unlikely(!ci.map)) { + bio_io_error(bio); + return; + } if (unlikely(bio_barrier(bio) && !dm_table_barrier_ok(ci.map))) { dm_table_put(ci.map); bio_endio(bio, -EOPNOTSUPP); - return 0; + return; } ci.md = md; ci.bio = bio; @@ -867,8 +860,6 @@ static int __split_bio(struct mapped_device *md, struct bio *bio) /* drop the extra reference count */ dec_pending(ci.io, error); dm_table_put(ci.map); - - return 0; } /*----------------------------------------------------------------- * CRUD END @@ -959,8 +950,9 @@ static int dm_request(struct request_queue *q, struct bio *bio) down_read(&md->io_lock); } - r = __split_bio(md, bio); + __split_and_process_bio(md, bio); up_read(&md->io_lock); + return 0; out_req: if (r < 0) @@ -1074,6 +1066,8 @@ out: static struct block_device_operations dm_blk_dops; +static void dm_wq_work(struct work_struct *work); + /* * Allocate and initialise a blank device with a given minor. */ @@ -1101,7 +1095,7 @@ static struct mapped_device *alloc_dev(int minor) init_rwsem(&md->io_lock); mutex_init(&md->suspend_lock); - spin_lock_init(&md->pushback_lock); + spin_lock_init(&md->deferred_lock); rwlock_init(&md->map_lock); atomic_set(&md->holders, 1); atomic_set(&md->open_count, 0); @@ -1118,6 +1112,7 @@ static struct mapped_device *alloc_dev(int minor) md->queue->backing_dev_info.congested_fn = dm_any_congested; md->queue->backing_dev_info.congested_data = md; blk_queue_make_request(md->queue, dm_request); + blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL); blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); md->queue->unplug_fn = dm_unplug_all; blk_queue_merge_bvec(md->queue, dm_merge_bvec); @@ -1140,6 +1135,7 @@ static struct mapped_device *alloc_dev(int minor) atomic_set(&md->pending, 0); init_waitqueue_head(&md->wait); + INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); md->disk->major = _major; @@ -1379,18 +1375,24 @@ void dm_put(struct mapped_device *md) } EXPORT_SYMBOL_GPL(dm_put); -static int dm_wait_for_completion(struct mapped_device *md) +static int dm_wait_for_completion(struct mapped_device *md, int interruptible) { int r = 0; + DECLARE_WAITQUEUE(wait, current); + + dm_unplug_all(md->queue); + + add_wait_queue(&md->wait, &wait); while (1) { - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(interruptible); smp_mb(); if (!atomic_read(&md->pending)) break; - if (signal_pending(current)) { + if (interruptible == TASK_INTERRUPTIBLE && + signal_pending(current)) { r = -EINTR; break; } @@ -1399,67 +1401,40 @@ static int dm_wait_for_completion(struct mapped_device *md) } set_current_state(TASK_RUNNING); + remove_wait_queue(&md->wait, &wait); + return r; } /* * Process the deferred bios */ -static void __flush_deferred_io(struct mapped_device *md) +static void dm_wq_work(struct work_struct *work) { + struct mapped_device *md = container_of(work, struct mapped_device, + work); struct bio *c; - while ((c = bio_list_pop(&md->deferred))) { - if (__split_bio(md, c)) - bio_io_error(c); - } - - clear_bit(DMF_BLOCK_IO, &md->flags); -} + down_write(&md->io_lock); -static void __merge_pushback_list(struct mapped_device *md) -{ - unsigned long flags; +next_bio: + spin_lock_irq(&md->deferred_lock); + c = bio_list_pop(&md->deferred); + spin_unlock_irq(&md->deferred_lock); - spin_lock_irqsave(&md->pushback_lock, flags); - clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - bio_list_merge_head(&md->deferred, &md->pushback); - bio_list_init(&md->pushback); - spin_unlock_irqrestore(&md->pushback_lock, flags); -} + if (c) { + __split_and_process_bio(md, c); + goto next_bio; + } -static void dm_wq_work(struct work_struct *work) -{ - struct dm_wq_req *req = container_of(work, struct dm_wq_req, work); - struct mapped_device *md = req->md; + clear_bit(DMF_BLOCK_IO, &md->flags); - down_write(&md->io_lock); - switch (req->type) { - case DM_WQ_FLUSH_DEFERRED: - __flush_deferred_io(md); - break; - default: - DMERR("dm_wq_work: unrecognised work type %d", req->type); - BUG(); - } up_write(&md->io_lock); } -static void dm_wq_queue(struct mapped_device *md, int type, void *context, - struct dm_wq_req *req) -{ - req->type = type; - req->md = md; - req->context = context; - INIT_WORK(&req->work, dm_wq_work); - queue_work(md->wq, &req->work); -} - -static void dm_queue_flush(struct mapped_device *md, int type, void *context) +static void dm_queue_flush(struct mapped_device *md) { - struct dm_wq_req req; - - dm_wq_queue(md, type, context, &req); + queue_work(md->wq, &md->work); flush_workqueue(md->wq); } @@ -1534,7 +1509,6 @@ static void unlock_fs(struct mapped_device *md) int dm_suspend(struct mapped_device *md, unsigned suspend_flags) { struct dm_table *map = NULL; - DECLARE_WAITQUEUE(wait, current); int r = 0; int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; @@ -1584,28 +1558,22 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) down_write(&md->io_lock); set_bit(DMF_BLOCK_IO, &md->flags); - add_wait_queue(&md->wait, &wait); up_write(&md->io_lock); - /* unplug */ - if (map) - dm_table_unplug_all(map); - /* * Wait for the already-mapped ios to complete. */ - r = dm_wait_for_completion(md); + r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); down_write(&md->io_lock); - remove_wait_queue(&md->wait, &wait); if (noflush) - __merge_pushback_list(md); + clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); up_write(&md->io_lock); /* were we interrupted ? */ if (r < 0) { - dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL); + dm_queue_flush(md); unlock_fs(md); goto out; /* pushback list is already flushed, so skip flush */ @@ -1645,7 +1613,7 @@ int dm_resume(struct mapped_device *md) if (r) goto out; - dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL); + dm_queue_flush(md); unlock_fs(md); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 20194e000c5..b48397c0abb 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -60,7 +60,7 @@ int dm_table_barrier_ok(struct dm_table *t); int dm_target_init(void); void dm_target_exit(void); struct target_type *dm_get_target_type(const char *name); -void dm_put_target_type(struct target_type *t); +void dm_put_target_type(struct target_type *tt); int dm_target_iterate(void (*iter_func)(struct target_type *tt, void *param), void *param); diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 86d9adf90e7..8695809b24b 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -62,7 +62,10 @@ #define ModeShift 5 #define MaxFault 50 -#include <linux/raid/md.h> +#include <linux/blkdev.h> +#include <linux/raid/md_u.h> +#include "md.h" +#include <linux/seq_file.h> static void faulty_fail(struct bio *bio, int error) @@ -280,6 +283,17 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size) return 0; } +static sector_t faulty_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + WARN_ONCE(raid_disks, + "%s does not support generic reshape\n", __func__); + + if (sectors == 0) + return mddev->dev_sectors; + + return sectors; +} + static int run(mddev_t *mddev) { mdk_rdev_t *rdev; @@ -298,7 +312,7 @@ static int run(mddev_t *mddev) list_for_each_entry(rdev, &mddev->disks, same_set) conf->rdev = rdev; - mddev->array_sectors = mddev->size * 2; + md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); mddev->private = conf; reconfig(mddev, mddev->layout, -1); @@ -325,6 +339,7 @@ static struct mdk_personality faulty_personality = .stop = stop, .status = status, .reconfig = reconfig, + .size = faulty_size, }; static int __init raid_init(void) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 09658b21847..7a36e38393a 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -16,7 +16,11 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include <linux/raid/linear.h> +#include <linux/blkdev.h> +#include <linux/raid/md_u.h> +#include <linux/seq_file.h> +#include "md.h" +#include "linear.h" /* * find which device holds a particular offset @@ -97,6 +101,16 @@ static int linear_congested(void *data, int bits) return ret; } +static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + linear_conf_t *conf = mddev_to_conf(mddev); + + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); + + return conf->array_sectors; +} + static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) { linear_conf_t *conf; @@ -135,8 +149,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) mddev->queue->max_sectors > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); - disk->num_sectors = rdev->size * 2; - conf->array_sectors += rdev->size * 2; + disk->num_sectors = rdev->sectors; + conf->array_sectors += rdev->sectors; cnt++; } @@ -249,7 +263,7 @@ static int linear_run (mddev_t *mddev) if (!conf) return 1; mddev->private = conf; - mddev->array_sectors = conf->array_sectors; + md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->unplug_fn = linear_unplug; @@ -283,7 +297,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) newconf->prev = mddev_to_conf(mddev); mddev->private = newconf; mddev->raid_disks++; - mddev->array_sectors = newconf->array_sectors; + md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); return 0; } @@ -381,6 +395,7 @@ static struct mdk_personality linear_personality = .stop = linear_stop, .status = linear_status, .hot_add_disk = linear_add, + .size = linear_size, }; static int __init linear_init (void) diff --git a/drivers/md/linear.h b/drivers/md/linear.h new file mode 100644 index 00000000000..bf8179587f9 --- /dev/null +++ b/drivers/md/linear.h @@ -0,0 +1,29 @@ +#ifndef _LINEAR_H +#define _LINEAR_H + +struct dev_info { + mdk_rdev_t *rdev; + sector_t num_sectors; + sector_t start_sector; +}; + +typedef struct dev_info dev_info_t; + +struct linear_private_data +{ + struct linear_private_data *prev; /* earlier version */ + dev_info_t **hash_table; + sector_t spacing; + sector_t array_sectors; + int sector_shift; /* shift before dividing + * by spacing + */ + dev_info_t disks[0]; +}; + + +typedef struct linear_private_data linear_conf_t; + +#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) + +#endif diff --git a/drivers/md/md.c b/drivers/md/md.c index a307f87eb90..ed5727c089a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -33,9 +33,9 @@ */ #include <linux/kthread.h> -#include <linux/raid/md.h> -#include <linux/raid/bitmap.h> +#include <linux/blkdev.h> #include <linux/sysctl.h> +#include <linux/seq_file.h> #include <linux/buffer_head.h> /* for invalidate_bdev */ #include <linux/poll.h> #include <linux/ctype.h> @@ -45,11 +45,10 @@ #include <linux/reboot.h> #include <linux/file.h> #include <linux/delay.h> - -#define MAJOR_NR MD_MAJOR - -/* 63 partitions with the alternate major number (mdp) */ -#define MdpMinorShift 6 +#include <linux/raid/md_p.h> +#include <linux/raid/md_u.h> +#include "md.h" +#include "bitmap.h" #define DEBUG 0 #define dprintk(x...) ((void)(DEBUG && printk(x))) @@ -202,12 +201,68 @@ static DEFINE_SPINLOCK(all_mddevs_lock); ) -static int md_fail_request(struct request_queue *q, struct bio *bio) +/* Rather than calling directly into the personality make_request function, + * IO requests come here first so that we can check if the device is + * being suspended pending a reconfiguration. + * We hold a refcount over the call to ->make_request. By the time that + * call has finished, the bio has been linked into some internal structure + * and so is visible to ->quiesce(), so we don't need the refcount any more. + */ +static int md_make_request(struct request_queue *q, struct bio *bio) { - bio_io_error(bio); - return 0; + mddev_t *mddev = q->queuedata; + int rv; + if (mddev == NULL || mddev->pers == NULL) { + bio_io_error(bio); + return 0; + } + rcu_read_lock(); + if (mddev->suspended) { + DEFINE_WAIT(__wait); + for (;;) { + prepare_to_wait(&mddev->sb_wait, &__wait, + TASK_UNINTERRUPTIBLE); + if (!mddev->suspended) + break; + rcu_read_unlock(); + schedule(); + rcu_read_lock(); + } + finish_wait(&mddev->sb_wait, &__wait); + } + atomic_inc(&mddev->active_io); + rcu_read_unlock(); + rv = mddev->pers->make_request(q, bio); + if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) + wake_up(&mddev->sb_wait); + + return rv; +} + +static void mddev_suspend(mddev_t *mddev) +{ + BUG_ON(mddev->suspended); + mddev->suspended = 1; + synchronize_rcu(); + wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); + mddev->pers->quiesce(mddev, 1); + md_unregister_thread(mddev->thread); + mddev->thread = NULL; + /* we now know that no code is executing in the personality module, + * except possibly the tail end of a ->bi_end_io function, but that + * is certain to complete before the module has a chance to get + * unloaded + */ +} + +static void mddev_resume(mddev_t *mddev) +{ + mddev->suspended = 0; + wake_up(&mddev->sb_wait); + mddev->pers->quiesce(mddev, 0); } + static inline mddev_t *mddev_get(mddev_t *mddev) { atomic_inc(&mddev->active); @@ -310,6 +365,7 @@ static mddev_t * mddev_find(dev_t unit) init_timer(&new->safemode_timer); atomic_set(&new->active, 1); atomic_set(&new->openers, 0); + atomic_set(&new->active_io, 0); spin_lock_init(&new->write_lock); init_waitqueue_head(&new->sb_wait); init_waitqueue_head(&new->recovery_wait); @@ -326,6 +382,11 @@ static inline int mddev_lock(mddev_t * mddev) return mutex_lock_interruptible(&mddev->reconfig_mutex); } +static inline int mddev_is_locked(mddev_t *mddev) +{ + return mutex_is_locked(&mddev->reconfig_mutex); +} + static inline int mddev_trylock(mddev_t * mddev) { return mutex_trylock(&mddev->reconfig_mutex); @@ -409,7 +470,7 @@ static void free_disk_sb(mdk_rdev_t * rdev) rdev->sb_loaded = 0; rdev->sb_page = NULL; rdev->sb_start = 0; - rdev->size = 0; + rdev->sectors = 0; } } @@ -775,9 +836,9 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version else ret = 0; } - rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2; + rdev->sectors = calc_num_sectors(rdev, sb->chunk_size); - if (rdev->size < sb->size && sb->level > 1) + if (rdev->sectors < sb->size * 2 && sb->level > 1) /* "this cannot possibly happen" ... */ ret = -EINVAL; @@ -812,7 +873,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->clevel[0] = 0; mddev->layout = sb->layout; mddev->raid_disks = sb->raid_disks; - mddev->size = sb->size; + mddev->dev_sectors = sb->size * 2; mddev->events = ev1; mddev->bitmap_offset = 0; mddev->default_bitmap_offset = MD_SB_BYTES >> 9; @@ -926,7 +987,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->ctime = mddev->ctime; sb->level = mddev->level; - sb->size = mddev->size; + sb->size = mddev->dev_sectors / 2; sb->raid_disks = mddev->raid_disks; sb->md_minor = mddev->md_minor; sb->not_persistent = 0; @@ -1024,7 +1085,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) static unsigned long long super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) { - if (num_sectors && num_sectors < rdev->mddev->size * 2) + if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ if (rdev->mddev->bitmap_offset) return 0; /* can't move bitmap */ @@ -1180,16 +1241,17 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) ret = 0; } if (minor_version) - rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; + rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) - + le64_to_cpu(sb->data_offset); else - rdev->size = rdev->sb_start / 2; - if (rdev->size < le64_to_cpu(sb->data_size)/2) + rdev->sectors = rdev->sb_start; + if (rdev->sectors < le64_to_cpu(sb->data_size)) return -EINVAL; - rdev->size = le64_to_cpu(sb->data_size)/2; + rdev->sectors = le64_to_cpu(sb->data_size); if (le32_to_cpu(sb->chunksize)) - rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); + rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1); - if (le64_to_cpu(sb->size) > rdev->size*2) + if (le64_to_cpu(sb->size) > rdev->sectors) return -EINVAL; return ret; } @@ -1216,7 +1278,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->clevel[0] = 0; mddev->layout = le32_to_cpu(sb->layout); mddev->raid_disks = le32_to_cpu(sb->raid_disks); - mddev->size = le64_to_cpu(sb->size)/2; + mddev->dev_sectors = le64_to_cpu(sb->size); mddev->events = ev1; mddev->bitmap_offset = 0; mddev->default_bitmap_offset = 1024 >> 9; @@ -1312,7 +1374,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); sb->raid_disks = cpu_to_le32(mddev->raid_disks); - sb->size = cpu_to_le64(mddev->size<<1); + sb->size = cpu_to_le64(mddev->dev_sectors); if (mddev->bitmap && mddev->bitmap_file == NULL) { sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); @@ -1320,10 +1382,15 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) } if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset > 0) { - sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); - sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); + !test_bit(In_sync, &rdev->flags)) { + if (mddev->curr_resync_completed > rdev->recovery_offset) + rdev->recovery_offset = mddev->curr_resync_completed; + if (rdev->recovery_offset > 0) { + sb->feature_map |= + cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); + sb->recovery_offset = + cpu_to_le64(rdev->recovery_offset); + } } if (mddev->reshape_position != MaxSector) { @@ -1365,7 +1432,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) { struct mdp_superblock_1 *sb; sector_t max_sectors; - if (num_sectors && num_sectors < rdev->mddev->size * 2) + if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ if (rdev->sb_start < rdev->data_offset) { /* minor versions 1 and 2; superblock before data */ @@ -1381,7 +1448,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) sector_t sb_start; sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; sb_start &= ~(sector_t)(4*2 - 1); - max_sectors = rdev->size * 2 + sb_start - rdev->sb_start; + max_sectors = rdev->sectors + sb_start - rdev->sb_start; if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; rdev->sb_start = sb_start; @@ -1433,6 +1500,38 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) static LIST_HEAD(pending_raid_disks); +static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev) +{ + struct mdk_personality *pers = mddev->pers; + struct gendisk *disk = mddev->gendisk; + struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); + struct blk_integrity *bi_mddev = blk_get_integrity(disk); + + /* Data integrity passthrough not supported on RAID 4, 5 and 6 */ + if (pers && pers->level >= 4 && pers->level <= 6) + return; + + /* If rdev is integrity capable, register profile for mddev */ + if (!bi_mddev && bi_rdev) { + if (blk_integrity_register(disk, bi_rdev)) + printk(KERN_ERR "%s: %s Could not register integrity!\n", + __func__, disk->disk_name); + else + printk(KERN_NOTICE "Enabling data integrity on %s\n", + disk->disk_name); + return; + } + + /* Check that mddev and rdev have matching profiles */ + if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) { + printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__, + disk->disk_name, rdev->bdev->bd_disk->disk_name); + printk(KERN_NOTICE "Disabling data integrity on %s\n", + disk->disk_name); + blk_integrity_unregister(disk); + } +} + static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) { char b[BDEVNAME_SIZE]; @@ -1449,8 +1548,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if (find_rdev(mddev, rdev->bdev->bd_dev)) return -EEXIST; - /* make sure rdev->size exceeds mddev->size */ - if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { + /* make sure rdev->sectors exceeds mddev->dev_sectors */ + if (rdev->sectors && (mddev->dev_sectors == 0 || + rdev->sectors < mddev->dev_sectors)) { if (mddev->pers) { /* Cannot change size, so fail * If mddev->level <= 0, then we don't care @@ -1459,7 +1559,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if (mddev->level > 0) return -ENOSPC; } else - mddev->size = rdev->size; + mddev->dev_sectors = rdev->sectors; } /* Verify rdev->desc_nr is unique. @@ -1503,6 +1603,8 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) /* May as well allow recovery to be retried once */ mddev->recovery_disabled = 0; + + md_integrity_check(rdev, mddev); return 0; fail: @@ -1713,8 +1815,8 @@ static void print_sb_1(struct mdp_superblock_1 *sb) static void print_rdev(mdk_rdev_t *rdev, int major_version) { char b[BDEVNAME_SIZE]; - printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", - bdevname(rdev->bdev,b), (unsigned long long)rdev->size, + printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", + bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors, test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), rdev->desc_nr); if (rdev->sb_loaded) { @@ -2153,7 +2255,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) return -EINVAL; if (rdev->mddev->pers && rdev->raid_disk >= 0) return -EBUSY; - if (rdev->size && rdev->mddev->external) + if (rdev->sectors && rdev->mddev->external) /* Must set offset before size, so overlap checks * can be sane */ return -EBUSY; @@ -2167,7 +2269,7 @@ __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); static ssize_t rdev_size_show(mdk_rdev_t *rdev, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)rdev->size); + return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); } static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) @@ -2180,34 +2282,52 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) return 1; } +static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) +{ + unsigned long long blocks; + sector_t new; + + if (strict_strtoull(buf, 10, &blocks) < 0) + return -EINVAL; + + if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) + return -EINVAL; /* sector conversion overflow */ + + new = blocks * 2; + if (new != blocks * 2) + return -EINVAL; /* unsigned long long to sector_t overflow */ + + *sectors = new; + return 0; +} + static ssize_t rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) { - unsigned long long size; - unsigned long long oldsize = rdev->size; mddev_t *my_mddev = rdev->mddev; + sector_t oldsectors = rdev->sectors; + sector_t sectors; - if (strict_strtoull(buf, 10, &size) < 0) + if (strict_blocks_to_sectors(buf, §ors) < 0) return -EINVAL; if (my_mddev->pers && rdev->raid_disk >= 0) { if (my_mddev->persistent) { - size = super_types[my_mddev->major_version]. - rdev_size_change(rdev, size * 2); - if (!size) + sectors = super_types[my_mddev->major_version]. + rdev_size_change(rdev, sectors); + if (!sectors) return -EBUSY; - } else if (!size) { - size = (rdev->bdev->bd_inode->i_size >> 10); - size -= rdev->data_offset/2; - } + } else if (!sectors) + sectors = (rdev->bdev->bd_inode->i_size >> 9) - + rdev->data_offset; } - if (size < my_mddev->size) + if (sectors < my_mddev->dev_sectors) return -EINVAL; /* component must fit device */ - rdev->size = size; - if (size > oldsize && my_mddev->external) { + rdev->sectors = sectors; + if (sectors > oldsectors && my_mddev->external) { /* need to check that all other rdevs with the same ->bdev * do not overlap. We need to unlock the mddev to avoid - * a deadlock. We have already changed rdev->size, and if + * a deadlock. We have already changed rdev->sectors, and if * we have to change it back, we will have the lock again. */ mddev_t *mddev; @@ -2223,9 +2343,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (test_bit(AllReserved, &rdev2->flags) || (rdev->bdev == rdev2->bdev && rdev != rdev2 && - overlaps(rdev->data_offset, rdev->size * 2, + overlaps(rdev->data_offset, rdev->sectors, rdev2->data_offset, - rdev2->size * 2))) { + rdev2->sectors))) { overlap = 1; break; } @@ -2239,11 +2359,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (overlap) { /* Someone else could have slipped in a size * change here, but doing so is just silly. - * We put oldsize back because we *know* it is + * We put oldsectors back because we *know* it is * safe, and trust userspace not to race with * itself */ - rdev->size = oldsize; + rdev->sectors = oldsectors; return -EBUSY; } } @@ -2547,18 +2667,101 @@ level_show(mddev_t *mddev, char *page) static ssize_t level_store(mddev_t *mddev, const char *buf, size_t len) { + char level[16]; ssize_t rv = len; - if (mddev->pers) + struct mdk_personality *pers; + void *priv; + + if (mddev->pers == NULL) { + if (len == 0) + return 0; + if (len >= sizeof(mddev->clevel)) + return -ENOSPC; + strncpy(mddev->clevel, buf, len); + if (mddev->clevel[len-1] == '\n') + len--; + mddev->clevel[len] = 0; + mddev->level = LEVEL_NONE; + return rv; + } + + /* request to change the personality. Need to ensure: + * - array is not engaged in resync/recovery/reshape + * - old personality can be suspended + * - new personality will access other array. + */ + + if (mddev->sync_thread || mddev->reshape_position != MaxSector) return -EBUSY; - if (len == 0) - return 0; - if (len >= sizeof(mddev->clevel)) - return -ENOSPC; - strncpy(mddev->clevel, buf, len); - if (mddev->clevel[len-1] == '\n') + + if (!mddev->pers->quiesce) { + printk(KERN_WARNING "md: %s: %s does not support online personality change\n", + mdname(mddev), mddev->pers->name); + return -EINVAL; + } + + /* Now find the new personality */ + if (len == 0 || len >= sizeof(level)) + return -EINVAL; + strncpy(level, buf, len); + if (level[len-1] == '\n') len--; - mddev->clevel[len] = 0; - mddev->level = LEVEL_NONE; + level[len] = 0; + + request_module("md-%s", level); + spin_lock(&pers_lock); + pers = find_pers(LEVEL_NONE, level); + if (!pers || !try_module_get(pers->owner)) { + spin_unlock(&pers_lock); + printk(KERN_WARNING "md: personality %s not loaded\n", level); + return -EINVAL; + } + spin_unlock(&pers_lock); + + if (pers == mddev->pers) { + /* Nothing to do! */ + module_put(pers->owner); + return rv; + } + if (!pers->takeover) { + module_put(pers->owner); + printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", + mdname(mddev), level); + return -EINVAL; + } + + /* ->takeover must set new_* and/or delta_disks + * if it succeeds, and may set them when it fails. + */ + priv = pers->takeover(mddev); + if (IS_ERR(priv)) { + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk = mddev->chunk_size; + mddev->raid_disks -= mddev->delta_disks; + mddev->delta_disks = 0; + module_put(pers->owner); + printk(KERN_WARNING "md: %s: %s would not accept array\n", + mdname(mddev), level); + return PTR_ERR(priv); + } + + /* Looks like we have a winner */ + mddev_suspend(mddev); + mddev->pers->stop(mddev); + module_put(mddev->pers->owner); + mddev->pers = pers; + mddev->private = priv; + strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + mddev->level = mddev->new_level; + mddev->layout = mddev->new_layout; + mddev->chunk_size = mddev->new_chunk; + mddev->delta_disks = 0; + pers->run(mddev); + mddev_resume(mddev); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); return rv; } @@ -2586,12 +2789,18 @@ layout_store(mddev_t *mddev, const char *buf, size_t len) if (!*buf || (*e && *e != '\n')) return -EINVAL; - if (mddev->pers) - return -EBUSY; - if (mddev->reshape_position != MaxSector) + if (mddev->pers) { + int err; + if (mddev->pers->reconfig == NULL) + return -EBUSY; + err = mddev->pers->reconfig(mddev, n, -1); + if (err) + return err; + } else { mddev->new_layout = n; - else - mddev->layout = n; + if (mddev->reshape_position == MaxSector) + mddev->layout = n; + } return len; } static struct md_sysfs_entry md_layout = @@ -2648,19 +2857,24 @@ chunk_size_show(mddev_t *mddev, char *page) static ssize_t chunk_size_store(mddev_t *mddev, const char *buf, size_t len) { - /* can only set chunk_size if array is not yet active */ char *e; unsigned long n = simple_strtoul(buf, &e, 10); if (!*buf || (*e && *e != '\n')) return -EINVAL; - if (mddev->pers) - return -EBUSY; - else if (mddev->reshape_position != MaxSector) + if (mddev->pers) { + int err; + if (mddev->pers->reconfig == NULL) + return -EBUSY; + err = mddev->pers->reconfig(mddev, -1, n); + if (err) + return err; + } else { mddev->new_chunk = n; - else - mddev->chunk_size = n; + if (mddev->reshape_position == MaxSector) + mddev->chunk_size = n; + } return len; } static struct md_sysfs_entry md_chunk_size = @@ -2669,6 +2883,8 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); static ssize_t resync_start_show(mddev_t *mddev, char *page) { + if (mddev->recovery_cp == MaxSector) + return sprintf(page, "none\n"); return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); } @@ -2766,7 +2982,7 @@ array_state_show(mddev_t *mddev, char *page) else { if (list_empty(&mddev->disks) && mddev->raid_disks == 0 && - mddev->size == 0) + mddev->dev_sectors == 0) st = clear; else st = inactive; @@ -2973,7 +3189,8 @@ __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); static ssize_t size_show(mddev_t *mddev, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)mddev->size); + return sprintf(page, "%llu\n", + (unsigned long long)mddev->dev_sectors / 2); } static int update_size(mddev_t *mddev, sector_t num_sectors); @@ -2985,20 +3202,18 @@ size_store(mddev_t *mddev, const char *buf, size_t len) * not increase it (except from 0). * If array is active, we can try an on-line resize */ - char *e; - int err = 0; - unsigned long long size = simple_strtoull(buf, &e, 10); - if (!*buf || *buf == '\n' || - (*e && *e != '\n')) - return -EINVAL; + sector_t sectors; + int err = strict_blocks_to_sectors(buf, §ors); + if (err < 0) + return err; if (mddev->pers) { - err = update_size(mddev, size * 2); + err = update_size(mddev, sectors); md_update_sb(mddev, 1); } else { - if (mddev->size == 0 || - mddev->size > size) - mddev->size = size; + if (mddev->dev_sectors == 0 || + mddev->dev_sectors > sectors) + mddev->dev_sectors = sectors; else err = -ENOSPC; } @@ -3251,6 +3466,8 @@ static ssize_t sync_speed_show(mddev_t *mddev, char *page) { unsigned long resync, dt, db; + if (mddev->curr_resync == 0) + return sprintf(page, "none\n"); resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); dt = (jiffies - mddev->resync_mark) / HZ; if (!dt) dt++; @@ -3263,15 +3480,15 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); static ssize_t sync_completed_show(mddev_t *mddev, char *page) { - unsigned long max_blocks, resync; + unsigned long max_sectors, resync; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - max_blocks = mddev->resync_max_sectors; + max_sectors = mddev->resync_max_sectors; else - max_blocks = mddev->size << 1; + max_sectors = mddev->dev_sectors; resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); - return sprintf(page, "%lu / %lu\n", resync, max_blocks); + return sprintf(page, "%lu / %lu\n", resync, max_sectors); } static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); @@ -3431,6 +3648,57 @@ static struct md_sysfs_entry md_reshape_position = __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, reshape_position_store); +static ssize_t +array_size_show(mddev_t *mddev, char *page) +{ + if (mddev->external_size) + return sprintf(page, "%llu\n", + (unsigned long long)mddev->array_sectors/2); + else + return sprintf(page, "default\n"); +} + +static ssize_t +array_size_store(mddev_t *mddev, const char *buf, size_t len) +{ + sector_t sectors; + + if (strncmp(buf, "default", 7) == 0) { + if (mddev->pers) + sectors = mddev->pers->size(mddev, 0, 0); + else + sectors = mddev->array_sectors; + + mddev->external_size = 0; + } else { + if (strict_blocks_to_sectors(buf, §ors) < 0) + return -EINVAL; + if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) + return -EINVAL; + + mddev->external_size = 1; + } + + mddev->array_sectors = sectors; + set_capacity(mddev->gendisk, mddev->array_sectors); + if (mddev->pers) { + struct block_device *bdev = bdget_disk(mddev->gendisk, 0); + + if (bdev) { + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, + (loff_t)mddev->array_sectors << 9); + mutex_unlock(&bdev->bd_inode->i_mutex); + bdput(bdev); + } + } + + return len; +} + +static struct md_sysfs_entry md_array_size = +__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, + array_size_store); static struct attribute *md_default_attrs[] = { &md_level.attr, @@ -3444,6 +3712,7 @@ static struct attribute *md_default_attrs[] = { &md_safe_delay.attr, &md_array_state.attr, &md_reshape_position.attr, + &md_array_size.attr, NULL, }; @@ -3602,10 +3871,12 @@ static int md_alloc(dev_t dev, char *name) mddev_put(mddev); return -ENOMEM; } + mddev->queue->queuedata = mddev; + /* Can be unlocked because the queue is new: no concurrency */ queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue); - blk_queue_make_request(mddev->queue, md_fail_request); + blk_queue_make_request(mddev->queue, md_make_request); disk = alloc_disk(1 << shift); if (!disk) { @@ -3731,13 +4002,13 @@ static int do_md_run(mddev_t * mddev) list_for_each_entry(rdev, &mddev->disks, same_set) { if (test_bit(Faulty, &rdev->flags)) continue; - if (rdev->size < chunk_size / 1024) { + if (rdev->sectors < chunk_size / 512) { printk(KERN_WARNING "md: Dev %s smaller than chunk_size:" - " %lluk < %dk\n", + " %llu < %d\n", bdevname(rdev->bdev,b), - (unsigned long long)rdev->size, - chunk_size / 1024); + (unsigned long long)rdev->sectors, + chunk_size / 512); return -EINVAL; } } @@ -3761,11 +4032,11 @@ static int do_md_run(mddev_t * mddev) /* perform some consistency tests on the device. * We don't want the data to overlap the metadata, - * Internal Bitmap issues has handled elsewhere. + * Internal Bitmap issues have been handled elsewhere. */ if (rdev->data_offset < rdev->sb_start) { - if (mddev->size && - rdev->data_offset + mddev->size*2 + if (mddev->dev_sectors && + rdev->data_offset + mddev->dev_sectors > rdev->sb_start) { printk("md: %s: data overlaps metadata\n", mdname(mddev)); @@ -3801,9 +4072,16 @@ static int do_md_run(mddev_t * mddev) } mddev->pers = pers; spin_unlock(&pers_lock); - mddev->level = pers->level; + if (mddev->level != pers->level) { + mddev->level = pers->level; + mddev->new_level = pers->level; + } strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + if (pers->level >= 4 && pers->level <= 6) + /* Cannot support integrity (yet) */ + blk_integrity_unregister(mddev->gendisk); + if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ @@ -3843,7 +4121,9 @@ static int do_md_run(mddev_t * mddev) } mddev->recovery = 0; - mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ + /* may be over-ridden by personality */ + mddev->resync_max_sectors = mddev->dev_sectors; + mddev->barriers_work = 1; mddev->ok_start_degraded = start_dirty_degraded; @@ -3853,7 +4133,17 @@ static int do_md_run(mddev_t * mddev) err = mddev->pers->run(mddev); if (err) printk(KERN_ERR "md: pers->run() failed ...\n"); - else if (mddev->pers->sync_request) { + else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { + WARN_ONCE(!mddev->external_size, "%s: default size too small," + " but 'external_size' not in effect?\n", __func__); + printk(KERN_ERR + "md: invalid array_size %llu > default size %llu\n", + (unsigned long long)mddev->array_sectors / 2, + (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); + err = -EINVAL; + mddev->pers->stop(mddev); + } + if (err == 0 && mddev->pers->sync_request) { err = bitmap_create(mddev); if (err) { printk(KERN_ERR "%s: failed to create bitmap (%d)\n", @@ -3899,16 +4189,6 @@ static int do_md_run(mddev_t * mddev) set_capacity(disk, mddev->array_sectors); - /* If we call blk_queue_make_request here, it will - * re-initialise max_sectors etc which may have been - * refined inside -> run. So just set the bits we need to set. - * Most initialisation happended when we called - * blk_queue_make_request(..., md_fail_request) - * earlier. - */ - mddev->queue->queuedata = mddev; - mddev->queue->make_request_fn = mddev->pers->make_request; - /* If there is a partially-recovered drive we need to * start recovery here. If we leave it to md_check_recovery, * it will remove the drives and not do the right thing @@ -4038,7 +4318,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) md_super_wait(mddev); if (mddev->ro) set_disk_ro(disk, 0); - blk_queue_make_request(mddev->queue, md_fail_request); + mddev->pers->stop(mddev); mddev->queue->merge_bvec_fn = NULL; mddev->queue->unplug_fn = NULL; @@ -4095,7 +4375,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) export_array(mddev); mddev->array_sectors = 0; - mddev->size = 0; + mddev->external_size = 0; + mddev->dev_sectors = 0; mddev->raid_disks = 0; mddev->recovery_cp = 0; mddev->resync_min = 0; @@ -4135,6 +4416,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) printk(KERN_INFO "md: %s switched to read-only mode.\n", mdname(mddev)); err = 0; + blk_integrity_unregister(disk); md_new_event(mddev); sysfs_notify_dirent(mddev->sysfs_state); out: @@ -4300,8 +4582,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg) info.patch_version = MD_PATCHLEVEL_VERSION; info.ctime = mddev->ctime; info.level = mddev->level; - info.size = mddev->size; - if (info.size != mddev->size) /* overflow */ + info.size = mddev->dev_sectors / 2; + if (info.size != mddev->dev_sectors / 2) /* overflow */ info.size = -1; info.nr_disks = nr; info.raid_disks = mddev->raid_disks; @@ -4480,6 +4762,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) clear_bit(In_sync, &rdev->flags); /* just to be sure */ if (info->state & (1<<MD_DISK_WRITEMOSTLY)) set_bit(WriteMostly, &rdev->flags); + else + clear_bit(WriteMostly, &rdev->flags); rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); @@ -4543,7 +4827,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; } else rdev->sb_start = calc_dev_sboffset(rdev->bdev); - rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; + rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); err = bind_rdev_to_array(rdev, mddev); if (err) { @@ -4613,7 +4897,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) else rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; - rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; + rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); if (test_bit(Faulty, &rdev->flags)) { printk(KERN_WARNING @@ -4749,7 +5033,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->level = info->level; mddev->clevel[0] = 0; - mddev->size = info->size; + mddev->dev_sectors = 2 * (sector_t)info->size; mddev->raid_disks = info->raid_disks; /* don't set md_minor, it is determined by which /dev/md* was * openned @@ -4788,6 +5072,17 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) return 0; } +void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors) +{ + WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); + + if (mddev->external_size) + return; + + mddev->array_sectors = array_sectors; +} +EXPORT_SYMBOL(md_set_array_sectors); + static int update_size(mddev_t *mddev, sector_t num_sectors) { mdk_rdev_t *rdev; @@ -4814,8 +5109,7 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) */ return -EBUSY; list_for_each_entry(rdev, &mddev->disks, same_set) { - sector_t avail; - avail = rdev->size * 2; + sector_t avail = rdev->sectors; if (fit && (num_sectors == 0 || num_sectors > avail)) num_sectors = avail; @@ -4887,12 +5181,18 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) ) return -EINVAL; /* Check there is only one change */ - if (info->size >= 0 && mddev->size != info->size) cnt++; - if (mddev->raid_disks != info->raid_disks) cnt++; - if (mddev->layout != info->layout) cnt++; - if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; - if (cnt == 0) return 0; - if (cnt > 1) return -EINVAL; + if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) + cnt++; + if (mddev->raid_disks != info->raid_disks) + cnt++; + if (mddev->layout != info->layout) + cnt++; + if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) + cnt++; + if (cnt == 0) + return 0; + if (cnt > 1) + return -EINVAL; if (mddev->layout != info->layout) { /* Change layout @@ -4904,7 +5204,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) else return mddev->pers->reconfig(mddev, info->layout, -1); } - if (info->size >= 0 && mddev->size != info->size) + if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) rv = update_size(mddev, (sector_t)info->size * 2); if (mddev->raid_disks != info->raid_disks) @@ -5331,6 +5631,8 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, void md_unregister_thread(mdk_thread_t *thread) { + if (!thread) + return; dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); kthread_stop(thread->tsk); @@ -5404,7 +5706,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) max_blocks = mddev->resync_max_sectors >> 1; else - max_blocks = mddev->size; + max_blocks = mddev->dev_sectors / 2; /* * Should not happen. @@ -5537,7 +5839,7 @@ struct mdstat_info { static int md_seq_show(struct seq_file *seq, void *v) { mddev_t *mddev = v; - sector_t size; + sector_t sectors; mdk_rdev_t *rdev; struct mdstat_info *mi = seq->private; struct bitmap *bitmap; @@ -5573,7 +5875,7 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, " %s", mddev->pers->name); } - size = 0; + sectors = 0; list_for_each_entry(rdev, &mddev->disks, same_set) { char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", @@ -5585,7 +5887,7 @@ static int md_seq_show(struct seq_file *seq, void *v) continue; } else if (rdev->raid_disk < 0) seq_printf(seq, "(S)"); /* spare */ - size += rdev->size; + sectors += rdev->sectors; } if (!list_empty(&mddev->disks)) { @@ -5595,7 +5897,7 @@ static int md_seq_show(struct seq_file *seq, void *v) mddev->array_sectors / 2); else seq_printf(seq, "\n %llu blocks", - (unsigned long long)size); + (unsigned long long)sectors / 2); } if (mddev->persistent) { if (mddev->major_version != 0 || @@ -5722,19 +6024,19 @@ int unregister_md_personality(struct mdk_personality *p) return 0; } -static int is_mddev_idle(mddev_t *mddev) +static int is_mddev_idle(mddev_t *mddev, int init) { mdk_rdev_t * rdev; int idle; - long curr_events; + int curr_events; idle = 1; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; - curr_events = part_stat_read(&disk->part0, sectors[0]) + - part_stat_read(&disk->part0, sectors[1]) - - atomic_read(&disk->sync_io); + curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]) - + atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats * as sync_io is counted when a request starts, and * disk_stats is counted when it completes. @@ -5757,7 +6059,7 @@ static int is_mddev_idle(mddev_t *mddev) * always make curr_events less than last_events. * */ - if (curr_events - rdev->last_events > 4096) { + if (init || curr_events - rdev->last_events > 64) { rdev->last_events = curr_events; idle = 0; } @@ -5980,10 +6282,10 @@ void md_do_sync(mddev_t *mddev) j = mddev->recovery_cp; } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - max_sectors = mddev->size << 1; + max_sectors = mddev->dev_sectors; else { /* recovery follows the physical size of devices */ - max_sectors = mddev->size << 1; + max_sectors = mddev->dev_sectors; j = MaxSector; list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0 && @@ -6000,7 +6302,7 @@ void md_do_sync(mddev_t *mddev) "(but not more than %d KB/sec) for %s.\n", speed_max(mddev), desc); - is_mddev_idle(mddev); /* this also initializes IO event counters */ + is_mddev_idle(mddev, 1); /* this initializes IO event counters */ io_sectors = 0; for (m = 0; m < SYNC_MARKS; m++) { @@ -6040,6 +6342,18 @@ void md_do_sync(mddev_t *mddev) } if (kthread_should_stop()) goto interrupted; + + if (mddev->curr_resync > mddev->curr_resync_completed && + (mddev->curr_resync - mddev->curr_resync_completed) + > (max_sectors >> 4)) { + /* time to update curr_resync_completed */ + blk_unplug(mddev->queue); + wait_event(mddev->recovery_wait, + atomic_read(&mddev->recovery_active) == 0); + mddev->curr_resync_completed = + mddev->curr_resync; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + } sectors = mddev->pers->sync_request(mddev, j, &skipped, currspeed < speed_min(mddev)); if (sectors == 0) { @@ -6102,7 +6416,7 @@ void md_do_sync(mddev_t *mddev) if (currspeed > speed_min(mddev)) { if ((currspeed > speed_max(mddev)) || - !is_mddev_idle(mddev)) { + !is_mddev_idle(mddev, 0)) { msleep(500); goto repeat; } @@ -6173,6 +6487,8 @@ static int remove_and_add_spares(mddev_t *mddev) mdk_rdev_t *rdev; int spares = 0; + mddev->curr_resync_completed = 0; + list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && @@ -6327,6 +6643,9 @@ void md_check_recovery(mddev_t *mddev) sysfs_notify(&mddev->kobj, NULL, "degraded"); } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + mddev->pers->finish_reshape) + mddev->pers->finish_reshape(mddev); md_update_sb(mddev, 1); /* if array is no-longer degraded, then any saved_raid_disk @@ -6470,13 +6789,13 @@ static void md_geninit(void) static int __init md_init(void) { - if (register_blkdev(MAJOR_NR, "md")) + if (register_blkdev(MD_MAJOR, "md")) return -1; if ((mdp_major=register_blkdev(0, "mdp"))<=0) { - unregister_blkdev(MAJOR_NR, "md"); + unregister_blkdev(MD_MAJOR, "md"); return -1; } - blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE, + blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, md_probe, NULL, NULL); blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, md_probe, NULL, NULL); @@ -6562,10 +6881,10 @@ static __exit void md_exit(void) mddev_t *mddev; struct list_head *tmp; - blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS); + blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS); blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); - unregister_blkdev(MAJOR_NR,"md"); + unregister_blkdev(MD_MAJOR,"md"); unregister_blkdev(mdp_major, "mdp"); unregister_reboot_notifier(&md_notifier); unregister_sysctl_table(raid_table_header); diff --git a/drivers/md/md.h b/drivers/md/md.h new file mode 100644 index 00000000000..e9b7f54c24d --- /dev/null +++ b/drivers/md/md.h @@ -0,0 +1,436 @@ +/* + md_k.h : kernel internal structure of the Linux MD driver + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_K_H +#define _MD_K_H + +#ifdef CONFIG_BLOCK + +#define MaxSector (~(sector_t)0) + +typedef struct mddev_s mddev_t; +typedef struct mdk_rdev_s mdk_rdev_t; + +/* + * options passed in raidrun: + */ + +/* Currently this must fit in an 'int' */ +#define MAX_CHUNK_SIZE (1<<30) + +/* + * MD's 'extended' device + */ +struct mdk_rdev_s +{ + struct list_head same_set; /* RAID devices within the same set */ + + sector_t sectors; /* Device size (in 512bytes sectors) */ + mddev_t *mddev; /* RAID array if running */ + int last_events; /* IO event timestamp */ + + struct block_device *bdev; /* block device handle */ + + struct page *sb_page; + int sb_loaded; + __u64 sb_events; + sector_t data_offset; /* start of data in array */ + sector_t sb_start; /* offset of the super block (in 512byte sectors) */ + int sb_size; /* bytes in the superblock */ + int preferred_minor; /* autorun support */ + + struct kobject kobj; + + /* A device can be in one of three states based on two flags: + * Not working: faulty==1 in_sync==0 + * Fully working: faulty==0 in_sync==1 + * Working, but not + * in sync with array + * faulty==0 in_sync==0 + * + * It can never have faulty==1, in_sync==1 + * This reduces the burden of testing multiple flags in many cases + */ + + unsigned long flags; +#define Faulty 1 /* device is known to have a fault */ +#define In_sync 2 /* device is in_sync with rest of array */ +#define WriteMostly 4 /* Avoid reading if at all possible */ +#define BarriersNotsupp 5 /* BIO_RW_BARRIER is not supported */ +#define AllReserved 6 /* If whole device is reserved for + * one array */ +#define AutoDetected 7 /* added by auto-detect */ +#define Blocked 8 /* An error occured on an externally + * managed array, don't allow writes + * until it is cleared */ +#define StateChanged 9 /* Faulty or Blocked has changed during + * interrupt, so it needs to be + * notified by the thread */ + wait_queue_head_t blocked_wait; + + int desc_nr; /* descriptor index in the superblock */ + int raid_disk; /* role of device in array */ + int saved_raid_disk; /* role that device used to have in the + * array and could again if we did a partial + * resync from the bitmap + */ + sector_t recovery_offset;/* If this device has been partially + * recovered, this is where we were + * up to. + */ + + atomic_t nr_pending; /* number of pending requests. + * only maintained for arrays that + * support hot removal + */ + atomic_t read_errors; /* number of consecutive read errors that + * we have tried to ignore. + */ + atomic_t corrected_errors; /* number of corrected read errors, + * for reporting to userspace and storing + * in superblock. + */ + struct work_struct del_work; /* used for delayed sysfs removal */ + + struct sysfs_dirent *sysfs_state; /* handle for 'state' + * sysfs entry */ +}; + +struct mddev_s +{ + void *private; + struct mdk_personality *pers; + dev_t unit; + int md_minor; + struct list_head disks; + unsigned long flags; +#define MD_CHANGE_DEVS 0 /* Some device status has changed */ +#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ +#define MD_CHANGE_PENDING 2 /* superblock update in progress */ + + int suspended; + atomic_t active_io; + int ro; + + struct gendisk *gendisk; + + struct kobject kobj; + int hold_active; +#define UNTIL_IOCTL 1 +#define UNTIL_STOP 2 + + /* Superblock information */ + int major_version, + minor_version, + patch_version; + int persistent; + int external; /* metadata is + * managed externally */ + char metadata_type[17]; /* externally set*/ + int chunk_size; + time_t ctime, utime; + int level, layout; + char clevel[16]; + int raid_disks; + int max_disks; + sector_t dev_sectors; /* used size of + * component devices */ + sector_t array_sectors; /* exported array size */ + int external_size; /* size managed + * externally */ + __u64 events; + + char uuid[16]; + + /* If the array is being reshaped, we need to record the + * new shape and an indication of where we are up to. + * This is written to the superblock. + * If reshape_position is MaxSector, then no reshape is happening (yet). + */ + sector_t reshape_position; + int delta_disks, new_level, new_layout, new_chunk; + + struct mdk_thread_s *thread; /* management thread */ + struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ + sector_t curr_resync; /* last block scheduled */ + /* As resync requests can complete out of order, we cannot easily track + * how much resync has been completed. So we occasionally pause until + * everything completes, then set curr_resync_completed to curr_resync. + * As such it may be well behind the real resync mark, but it is a value + * we are certain of. + */ + sector_t curr_resync_completed; + unsigned long resync_mark; /* a recent timestamp */ + sector_t resync_mark_cnt;/* blocks written at resync_mark */ + sector_t curr_mark_cnt; /* blocks scheduled now */ + + sector_t resync_max_sectors; /* may be set by personality */ + + sector_t resync_mismatches; /* count of sectors where + * parity/replica mismatch found + */ + + /* allow user-space to request suspension of IO to regions of the array */ + sector_t suspend_lo; + sector_t suspend_hi; + /* if zero, use the system-wide default */ + int sync_speed_min; + int sync_speed_max; + + /* resync even though the same disks are shared among md-devices */ + int parallel_resync; + + int ok_start_degraded; + /* recovery/resync flags + * NEEDED: we might need to start a resync/recover + * RUNNING: a thread is running, or about to be started + * SYNC: actually doing a resync, not a recovery + * RECOVER: doing recovery, or need to try it. + * INTR: resync needs to be aborted for some reason + * DONE: thread is done and is waiting to be reaped + * REQUEST: user-space has requested a sync (used with SYNC) + * CHECK: user-space request for for check-only, no repair + * RESHAPE: A reshape is happening + * + * If neither SYNC or RESHAPE are set, then it is a recovery. + */ +#define MD_RECOVERY_RUNNING 0 +#define MD_RECOVERY_SYNC 1 +#define MD_RECOVERY_RECOVER 2 +#define MD_RECOVERY_INTR 3 +#define MD_RECOVERY_DONE 4 +#define MD_RECOVERY_NEEDED 5 +#define MD_RECOVERY_REQUESTED 6 +#define MD_RECOVERY_CHECK 7 +#define MD_RECOVERY_RESHAPE 8 +#define MD_RECOVERY_FROZEN 9 + + unsigned long recovery; + int recovery_disabled; /* if we detect that recovery + * will always fail, set this + * so we don't loop trying */ + + int in_sync; /* know to not need resync */ + struct mutex reconfig_mutex; + atomic_t active; /* general refcount */ + atomic_t openers; /* number of active opens */ + + int changed; /* true if we might need to reread partition info */ + int degraded; /* whether md should consider + * adding a spare + */ + int barriers_work; /* initialised to true, cleared as soon + * as a barrier request to slave + * fails. Only supported + */ + struct bio *biolist; /* bios that need to be retried + * because BIO_RW_BARRIER is not supported + */ + + atomic_t recovery_active; /* blocks scheduled, but not written */ + wait_queue_head_t recovery_wait; + sector_t recovery_cp; + sector_t resync_min; /* user requested sync + * starts here */ + sector_t resync_max; /* resync should pause + * when it gets here */ + + struct sysfs_dirent *sysfs_state; /* handle for 'array_state' + * file in sysfs. + */ + struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */ + + struct work_struct del_work; /* used for delayed sysfs removal */ + + spinlock_t write_lock; + wait_queue_head_t sb_wait; /* for waiting on superblock updates */ + atomic_t pending_writes; /* number of active superblock writes */ + + unsigned int safemode; /* if set, update "clean" superblock + * when no writes pending. + */ + unsigned int safemode_delay; + struct timer_list safemode_timer; + atomic_t writes_pending; + struct request_queue *queue; /* for plugging ... */ + + atomic_t write_behind; /* outstanding async IO */ + unsigned int max_write_behind; /* 0 = sync */ + + struct bitmap *bitmap; /* the bitmap for the device */ + struct file *bitmap_file; /* the bitmap file */ + long bitmap_offset; /* offset from superblock of + * start of bitmap. May be + * negative, but not '0' + */ + long default_bitmap_offset; /* this is the offset to use when + * hot-adding a bitmap. It should + * eventually be settable by sysfs. + */ + + struct list_head all_mddevs; +}; + + +static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev) +{ + int faulty = test_bit(Faulty, &rdev->flags); + if (atomic_dec_and_test(&rdev->nr_pending) && faulty) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +} + +static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) +{ + atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); +} + +struct mdk_personality +{ + char *name; + int level; + struct list_head list; + struct module *owner; + int (*make_request)(struct request_queue *q, struct bio *bio); + int (*run)(mddev_t *mddev); + int (*stop)(mddev_t *mddev); + void (*status)(struct seq_file *seq, mddev_t *mddev); + /* error_handler must set ->faulty and clear ->in_sync + * if appropriate, and should abort recovery if needed + */ + void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev); + int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev); + int (*hot_remove_disk) (mddev_t *mddev, int number); + int (*spare_active) (mddev_t *mddev); + sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster); + int (*resize) (mddev_t *mddev, sector_t sectors); + sector_t (*size) (mddev_t *mddev, sector_t sectors, int raid_disks); + int (*check_reshape) (mddev_t *mddev); + int (*start_reshape) (mddev_t *mddev); + void (*finish_reshape) (mddev_t *mddev); + int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); + /* quiesce moves between quiescence states + * 0 - fully active + * 1 - no new requests allowed + * others - reserved + */ + void (*quiesce) (mddev_t *mddev, int state); + /* takeover is used to transition an array from one + * personality to another. The new personality must be able + * to handle the data in the current layout. + * e.g. 2drive raid1 -> 2drive raid5 + * ndrive raid5 -> degraded n+1drive raid6 with special layout + * If the takeover succeeds, a new 'private' structure is returned. + * This needs to be installed and then ->run used to activate the + * array. + */ + void *(*takeover) (mddev_t *mddev); +}; + + +struct md_sysfs_entry { + struct attribute attr; + ssize_t (*show)(mddev_t *, char *); + ssize_t (*store)(mddev_t *, const char *, size_t); +}; + + +static inline char * mdname (mddev_t * mddev) +{ + return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; +} + +/* + * iterates through some rdev ringlist. It's safe to remove the + * current 'rdev'. Dont touch 'tmp' though. + */ +#define rdev_for_each_list(rdev, tmp, head) \ + list_for_each_entry_safe(rdev, tmp, head, same_set) + +/* + * iterates through the 'same array disks' ringlist + */ +#define rdev_for_each(rdev, tmp, mddev) \ + list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) + +#define rdev_for_each_rcu(rdev, mddev) \ + list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) + +typedef struct mdk_thread_s { + void (*run) (mddev_t *mddev); + mddev_t *mddev; + wait_queue_head_t wqueue; + unsigned long flags; + struct task_struct *tsk; + unsigned long timeout; +} mdk_thread_t; + +#define THREAD_WAKEUP 0 + +#define __wait_event_lock_irq(wq, condition, lock, cmd) \ +do { \ + wait_queue_t __wait; \ + init_waitqueue_entry(&__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + spin_unlock_irq(&lock); \ + cmd; \ + schedule(); \ + spin_lock_irq(&lock); \ + } \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#define wait_event_lock_irq(wq, condition, lock, cmd) \ +do { \ + if (condition) \ + break; \ + __wait_event_lock_irq(wq, condition, lock, cmd); \ +} while (0) + +static inline void safe_put_page(struct page *p) +{ + if (p) put_page(p); +} + +#endif /* CONFIG_BLOCK */ +#endif + + +extern int register_md_personality(struct mdk_personality *p); +extern int unregister_md_personality(struct mdk_personality *p); +extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), + mddev_t *mddev, const char *name); +extern void md_unregister_thread(mdk_thread_t *thread); +extern void md_wakeup_thread(mdk_thread_t *thread); +extern void md_check_recovery(mddev_t *mddev); +extern void md_write_start(mddev_t *mddev, struct bio *bi); +extern void md_write_end(mddev_t *mddev); +extern void md_done_sync(mddev_t *mddev, int blocks, int ok); +extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); + +extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, + sector_t sector, int size, struct page *page); +extern void md_super_wait(mddev_t *mddev); +extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, + struct page *page, int rw); +extern void md_do_sync(mddev_t *mddev); +extern void md_new_event(mddev_t *mddev); +extern int md_allow_write(mddev_t *mddev); +extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); +extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); diff --git a/drivers/md/mktables.c b/drivers/md/mktables.c index b61d5767aae..3b1500843bb 100644 --- a/drivers/md/mktables.c +++ b/drivers/md/mktables.c @@ -59,7 +59,7 @@ int main(int argc, char *argv[]) uint8_t v; uint8_t exptbl[256], invtbl[256]; - printf("#include \"raid6.h\"\n"); + printf("#include <linux/raid/pq.h>\n"); /* Compute multiplication table */ printf("\nconst u8 __attribute__((aligned(256)))\n" @@ -76,6 +76,9 @@ int main(int argc, char *argv[]) printf("\t},\n"); } printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfmul);\n"); + printf("#endif\n"); /* Compute power-of-2 table (exponent) */ v = 1; @@ -92,6 +95,9 @@ int main(int argc, char *argv[]) } } printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfexp);\n"); + printf("#endif\n"); /* Compute inverse table x^-1 == x^254 */ printf("\nconst u8 __attribute__((aligned(256)))\n" @@ -104,6 +110,9 @@ int main(int argc, char *argv[]) } } printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfinv);\n"); + printf("#endif\n"); /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ printf("\nconst u8 __attribute__((aligned(256)))\n" @@ -115,6 +124,9 @@ int main(int argc, char *argv[]) (j == 7) ? '\n' : ' '); } printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfexi);\n"); + printf("#endif\n"); return 0; } diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index f6d08f24167..41ced0cbe82 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -19,7 +19,11 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include <linux/raid/multipath.h> +#include <linux/blkdev.h> +#include <linux/raid/md_u.h> +#include <linux/seq_file.h> +#include "md.h" +#include "multipath.h" #define MAX_WORK_PER_DISK 128 @@ -402,6 +406,14 @@ static void multipathd (mddev_t *mddev) spin_unlock_irqrestore(&conf->device_lock, flags); } +static sector_t multipath_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); + + return mddev->dev_sectors; +} + static int multipath_run (mddev_t *mddev) { multipath_conf_t *conf; @@ -498,7 +510,7 @@ static int multipath_run (mddev_t *mddev) /* * Ok, everything is just fine now */ - mddev->array_sectors = mddev->size * 2; + md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); mddev->queue->unplug_fn = multipath_unplug; mddev->queue->backing_dev_info.congested_fn = multipath_congested; @@ -543,6 +555,7 @@ static struct mdk_personality multipath_personality = .error_handler = multipath_error, .hot_add_disk = multipath_add_disk, .hot_remove_disk= multipath_remove_disk, + .size = multipath_size, }; static int __init multipath_init (void) diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h new file mode 100644 index 00000000000..6fa70b400cd --- /dev/null +++ b/drivers/md/multipath.h @@ -0,0 +1,40 @@ +#ifndef _MULTIPATH_H +#define _MULTIPATH_H + +struct multipath_info { + mdk_rdev_t *rdev; +}; + +struct multipath_private_data { + mddev_t *mddev; + struct multipath_info *multipaths; + int raid_disks; + int working_disks; + spinlock_t device_lock; + struct list_head retry_list; + + mempool_t *pool; +}; + +typedef struct multipath_private_data multipath_conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private) + +/* + * this is our 'private' 'collective' MULTIPATH buffer head. + * it contains information about what kind of IO operations were started + * for this MULTIPATH operation, and about their status: + */ + +struct multipath_bh { + mddev_t *mddev; + struct bio *master_bio; + struct bio bio; + int path; + struct list_head retry_list; +}; +#endif diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c605ba80558..c08d7559be5 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -18,7 +18,10 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include <linux/raid/raid0.h> +#include <linux/blkdev.h> +#include <linux/seq_file.h> +#include "md.h" +#include "raid0.h" static void raid0_unplug(struct request_queue *q) { @@ -73,16 +76,15 @@ static int create_strip_zones (mddev_t *mddev) list_for_each_entry(rdev2, &mddev->disks, same_set) { printk(KERN_INFO "raid0: comparing %s(%llu)", bdevname(rdev1->bdev,b), - (unsigned long long)rdev1->size); + (unsigned long long)rdev1->sectors); printk(KERN_INFO " with %s(%llu)\n", bdevname(rdev2->bdev,b), - (unsigned long long)rdev2->size); + (unsigned long long)rdev2->sectors); if (rdev2 == rdev1) { printk(KERN_INFO "raid0: END\n"); break; } - if (rdev2->size == rdev1->size) - { + if (rdev2->sectors == rdev1->sectors) { /* * Not unique, don't count it as a new * group @@ -145,7 +147,7 @@ static int create_strip_zones (mddev_t *mddev) mddev->queue->max_sectors > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); - if (!smallest || (rdev1->size <smallest->size)) + if (!smallest || (rdev1->sectors < smallest->sectors)) smallest = rdev1; cnt++; } @@ -155,10 +157,10 @@ static int create_strip_zones (mddev_t *mddev) goto abort; } zone->nb_dev = cnt; - zone->sectors = smallest->size * cnt * 2; + zone->sectors = smallest->sectors * cnt; zone->zone_start = 0; - current_start = smallest->size * 2; + current_start = smallest->sectors; curr_zone_start = zone->sectors; /* now do the other zones */ @@ -177,29 +179,29 @@ static int create_strip_zones (mddev_t *mddev) rdev = conf->strip_zone[0].dev[j]; printk(KERN_INFO "raid0: checking %s ...", bdevname(rdev->bdev, b)); - if (rdev->size > current_start / 2) { - printk(KERN_INFO " contained as device %d\n", - c); - zone->dev[c] = rdev; - c++; - if (!smallest || (rdev->size <smallest->size)) { - smallest = rdev; - printk(KERN_INFO " (%llu) is smallest!.\n", - (unsigned long long)rdev->size); - } - } else + if (rdev->sectors <= current_start) { printk(KERN_INFO " nope.\n"); + continue; + } + printk(KERN_INFO " contained as device %d\n", c); + zone->dev[c] = rdev; + c++; + if (!smallest || rdev->sectors < smallest->sectors) { + smallest = rdev; + printk(KERN_INFO " (%llu) is smallest!.\n", + (unsigned long long)rdev->sectors); + } } zone->nb_dev = c; - zone->sectors = (smallest->size * 2 - current_start) * c; + zone->sectors = (smallest->sectors - current_start) * c; printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", zone->nb_dev, (unsigned long long)zone->sectors); zone->zone_start = curr_zone_start; curr_zone_start += zone->sectors; - current_start = smallest->size * 2; + current_start = smallest->sectors; printk(KERN_INFO "raid0: current zone start: %llu\n", (unsigned long long)current_start); } @@ -261,12 +263,25 @@ static int raid0_mergeable_bvec(struct request_queue *q, return max; } +static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + sector_t array_sectors = 0; + mdk_rdev_t *rdev; + + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); + + list_for_each_entry(rdev, &mddev->disks, same_set) + array_sectors += rdev->sectors; + + return array_sectors; +} + static int raid0_run (mddev_t *mddev) { unsigned cur=0, i=0, nb_zone; s64 sectors; raid0_conf_t *conf; - mdk_rdev_t *rdev; if (mddev->chunk_size == 0) { printk(KERN_ERR "md/raid0: non-zero chunk size required.\n"); @@ -291,16 +306,14 @@ static int raid0_run (mddev_t *mddev) goto out_free_conf; /* calculate array device size */ - mddev->array_sectors = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) - mddev->array_sectors += rdev->size * 2; + md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", (unsigned long long)mddev->array_sectors); printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n", (unsigned long long)conf->spacing); { - sector_t s = mddev->array_sectors; + sector_t s = raid0_size(mddev, 0, 0); sector_t space = conf->spacing; int round; conf->sector_shift = 0; @@ -509,6 +522,7 @@ static struct mdk_personality raid0_personality= .run = raid0_run, .stop = raid0_stop, .status = raid0_status, + .size = raid0_size, }; static int __init raid0_init (void) diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h new file mode 100644 index 00000000000..824b12eb1d4 --- /dev/null +++ b/drivers/md/raid0.h @@ -0,0 +1,28 @@ +#ifndef _RAID0_H +#define _RAID0_H + +struct strip_zone +{ + sector_t zone_start; /* Zone offset in md_dev (in sectors) */ + sector_t dev_start; /* Zone offset in real dev (in sectors) */ + sector_t sectors; /* Zone size in sectors */ + int nb_dev; /* # of devices attached to the zone */ + mdk_rdev_t **dev; /* Devices attached to the zone */ +}; + +struct raid0_private_data +{ + struct strip_zone **hash_table; /* Table of indexes into strip_zone */ + struct strip_zone *strip_zone; + mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ + int nr_strip_zones; + + sector_t spacing; + int sector_shift; /* shift this before divide by spacing */ +}; + +typedef struct raid0_private_data raid0_conf_t; + +#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private) + +#endif diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e2466425d9c..b4f4badc006 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -31,10 +31,13 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include "dm-bio-list.h" #include <linux/delay.h> -#include <linux/raid/raid1.h> -#include <linux/raid/bitmap.h> +#include <linux/blkdev.h> +#include <linux/seq_file.h> +#include "md.h" +#include "dm-bio-list.h" +#include "raid1.h" +#include "bitmap.h" #define DEBUG 0 #if DEBUG @@ -1723,7 +1726,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return 0; } - max_sector = mddev->size << 1; + max_sector = mddev->dev_sectors; if (sector_nr >= max_sector) { /* If we aborted, we need to abort the * sync on the 'current' bitmap chunk (there will @@ -1919,6 +1922,14 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return nr_sectors; } +static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + if (sectors) + return sectors; + + return mddev->dev_sectors; +} + static int run(mddev_t *mddev) { conf_t *conf; @@ -2048,7 +2059,7 @@ static int run(mddev_t *mddev) /* * Ok, everything is just fine now */ - mddev->array_sectors = mddev->size * 2; + md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); mddev->queue->unplug_fn = raid1_unplug; mddev->queue->backing_dev_info.congested_fn = raid1_congested; @@ -2089,6 +2100,9 @@ static int stop(mddev_t *mddev) /* need to kick something here to make sure I/O goes? */ } + raise_barrier(conf); + lower_barrier(conf); + md_unregister_thread(mddev->thread); mddev->thread = NULL; blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ @@ -2110,15 +2124,17 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) * any io in the removed space completes, but it hardly seems * worth it. */ - mddev->array_sectors = sectors; + md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0)); + if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) + return -EINVAL; set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; - if (mddev->array_sectors / 2 > mddev->size && + if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { - mddev->recovery_cp = mddev->size << 1; + mddev->recovery_cp = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } - mddev->size = mddev->array_sectors / 2; + mddev->dev_sectors = sectors; mddev->resync_max_sectors = sectors; return 0; } @@ -2264,6 +2280,7 @@ static struct mdk_personality raid1_personality = .spare_active = raid1_spare_active, .sync_request = sync_request, .resize = raid1_resize, + .size = raid1_size, .check_reshape = raid1_reshape, .quiesce = raid1_quiesce, }; diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h new file mode 100644 index 00000000000..1620eea3d57 --- /dev/null +++ b/drivers/md/raid1.h @@ -0,0 +1,132 @@ +#ifndef _RAID1_H +#define _RAID1_H + +typedef struct mirror_info mirror_info_t; + +struct mirror_info { + mdk_rdev_t *rdev; + sector_t head_position; +}; + +/* + * memory pools need a pointer to the mddev, so they can force an unplug + * when memory is tight, and a count of the number of drives that the + * pool was allocated for, so they know how much to allocate and free. + * mddev->raid_disks cannot be used, as it can change while a pool is active + * These two datums are stored in a kmalloced struct. + */ + +struct pool_info { + mddev_t *mddev; + int raid_disks; +}; + + +typedef struct r1bio_s r1bio_t; + +struct r1_private_data_s { + mddev_t *mddev; + mirror_info_t *mirrors; + int raid_disks; + int last_used; + sector_t next_seq_sect; + spinlock_t device_lock; + + struct list_head retry_list; + /* queue pending writes and submit them on unplug */ + struct bio_list pending_bio_list; + /* queue of writes that have been unplugged */ + struct bio_list flushing_bio_list; + + /* for use when syncing mirrors: */ + + spinlock_t resync_lock; + int nr_pending; + int nr_waiting; + int nr_queued; + int barrier; + sector_t next_resync; + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + + wait_queue_head_t wait_barrier; + + struct pool_info *poolinfo; + + struct page *tmppage; + + mempool_t *r1bio_pool; + mempool_t *r1buf_pool; +}; + +typedef struct r1_private_data_s conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((conf_t *) mddev->private) + +/* + * this is our 'private' RAID1 bio. + * + * it contains information about what kind of IO operations were started + * for this RAID1 operation, and about their status: + */ + +struct r1bio_s { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + atomic_t behind_remaining; /* number of write-behind ios remaining + * in this BehindIO request + */ + sector_t sector; + int sectors; + unsigned long state; + mddev_t *mddev; + /* + * original bio going to /dev/mdx + */ + struct bio *master_bio; + /* + * if the IO is in READ direction, then this is where we read + */ + int read_disk; + + struct list_head retry_list; + struct bitmap_update *bitmap_update; + /* + * if the IO is in WRITE direction, then multiple bios are used. + * We choose the number when they are allocated. + */ + struct bio *bios[0]; + /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ +}; + +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio*)1) + +/* bits for r1bio.state */ +#define R1BIO_Uptodate 0 +#define R1BIO_IsSync 1 +#define R1BIO_Degraded 2 +#define R1BIO_BehindIO 3 +#define R1BIO_Barrier 4 +#define R1BIO_BarrierRetry 5 +/* For write-behind requests, we call bi_end_io when + * the last non-write-behind device completes, providing + * any write was successful. Otherwise we call when + * any write-behind write succeeds, otherwise we call + * with failure when last write completes (and all failed). + * Record that bi_end_io was called with this flag... + */ +#define R1BIO_Returned 6 + +#endif diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7301631abe0..e293d92641a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -18,10 +18,13 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include "dm-bio-list.h" #include <linux/delay.h> -#include <linux/raid/raid10.h> -#include <linux/raid/bitmap.h> +#include <linux/blkdev.h> +#include <linux/seq_file.h> +#include "md.h" +#include "dm-bio-list.h" +#include "raid10.h" +#include "bitmap.h" /* * RAID10 provides a combination of RAID0 and RAID1 functionality. @@ -1695,7 +1698,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return 0; skipped: - max_sector = mddev->size << 1; + max_sector = mddev->dev_sectors; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) max_sector = mddev->resync_max_sectors; if (sector_nr >= max_sector) { @@ -2020,6 +2023,25 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i goto skipped; } +static sector_t +raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + sector_t size; + conf_t *conf = mddev_to_conf(mddev); + + if (!raid_disks) + raid_disks = mddev->raid_disks; + if (!sectors) + sectors = mddev->dev_sectors; + + size = sectors >> conf->chunk_shift; + sector_div(size, conf->far_copies); + size = size * raid_disks; + sector_div(size, conf->near_copies); + + return size << conf->chunk_shift; +} + static int run(mddev_t *mddev) { conf_t *conf; @@ -2076,7 +2098,7 @@ static int run(mddev_t *mddev) conf->far_offset = fo; conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; conf->chunk_shift = ffz(~mddev->chunk_size) - 9; - size = mddev->size >> (conf->chunk_shift-1); + size = mddev->dev_sectors >> conf->chunk_shift; sector_div(size, fc); size = size * conf->raid_disks; sector_div(size, nc); @@ -2089,7 +2111,7 @@ static int run(mddev_t *mddev) */ stride += conf->raid_disks - 1; sector_div(stride, conf->raid_disks); - mddev->size = stride << (conf->chunk_shift-1); + mddev->dev_sectors = stride << conf->chunk_shift; if (fo) stride = 1; @@ -2171,8 +2193,8 @@ static int run(mddev_t *mddev) /* * Ok, everything is just fine now */ - mddev->array_sectors = size << conf->chunk_shift; - mddev->resync_max_sectors = size << conf->chunk_shift; + md_set_array_sectors(mddev, raid10_size(mddev, 0, 0)); + mddev->resync_max_sectors = raid10_size(mddev, 0, 0); mddev->queue->unplug_fn = raid10_unplug; mddev->queue->backing_dev_info.congested_fn = raid10_congested; @@ -2208,6 +2230,9 @@ static int stop(mddev_t *mddev) { conf_t *conf = mddev_to_conf(mddev); + raise_barrier(conf, 0); + lower_barrier(conf); + md_unregister_thread(mddev->thread); mddev->thread = NULL; blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ @@ -2255,6 +2280,7 @@ static struct mdk_personality raid10_personality = .spare_active = raid10_spare_active, .sync_request = sync_request, .quiesce = raid10_quiesce, + .size = raid10_size, }; static int __init raid_init(void) diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h new file mode 100644 index 00000000000..244dbe507a5 --- /dev/null +++ b/drivers/md/raid10.h @@ -0,0 +1,121 @@ +#ifndef _RAID10_H +#define _RAID10_H + +typedef struct mirror_info mirror_info_t; + +struct mirror_info { + mdk_rdev_t *rdev; + sector_t head_position; +}; + +typedef struct r10bio_s r10bio_t; + +struct r10_private_data_s { + mddev_t *mddev; + mirror_info_t *mirrors; + int raid_disks; + spinlock_t device_lock; + + /* geometry */ + int near_copies; /* number of copies layed out raid0 style */ + int far_copies; /* number of copies layed out + * at large strides across drives + */ + int far_offset; /* far_copies are offset by 1 stripe + * instead of many + */ + int copies; /* near_copies * far_copies. + * must be <= raid_disks + */ + sector_t stride; /* distance between far copies. + * This is size / far_copies unless + * far_offset, in which case it is + * 1 stripe. + */ + + int chunk_shift; /* shift from chunks to sectors */ + sector_t chunk_mask; + + struct list_head retry_list; + /* queue pending writes and submit them on unplug */ + struct bio_list pending_bio_list; + + + spinlock_t resync_lock; + int nr_pending; + int nr_waiting; + int nr_queued; + int barrier; + sector_t next_resync; + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + + wait_queue_head_t wait_barrier; + + mempool_t *r10bio_pool; + mempool_t *r10buf_pool; + struct page *tmppage; +}; + +typedef struct r10_private_data_s conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((conf_t *) mddev->private) + +/* + * this is our 'private' RAID10 bio. + * + * it contains information about what kind of IO operations were started + * for this RAID10 operation, and about their status: + */ + +struct r10bio_s { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + sector_t sector; /* virtual sector number */ + int sectors; + unsigned long state; + mddev_t *mddev; + /* + * original bio going to /dev/mdx + */ + struct bio *master_bio; + /* + * if the IO is in READ direction, then this is where we read + */ + int read_slot; + + struct list_head retry_list; + /* + * if the IO is in WRITE direction, then multiple bios are used, + * one for each copy. + * When resyncing we also use one for each copy. + * When reconstructing, we use 2 bios, one for read, one for write. + * We choose the number when they are allocated. + */ + struct { + struct bio *bio; + sector_t addr; + int devnum; + } devs[0]; +}; + +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio*)1) + +/* bits for r10bio.state */ +#define R10BIO_Uptodate 0 +#define R10BIO_IsSync 1 +#define R10BIO_IsRecover 2 +#define R10BIO_Degraded 3 +#endif diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a5ba080d303..3bbc6d64704 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -43,11 +43,14 @@ * miss any bits. */ +#include <linux/blkdev.h> #include <linux/kthread.h> -#include "raid6.h" - -#include <linux/raid/bitmap.h> +#include <linux/raid/pq.h> #include <linux/async_tx.h> +#include <linux/seq_file.h> +#include "md.h" +#include "raid5.h" +#include "bitmap.h" /* * Stripe cache @@ -91,11 +94,6 @@ #define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) -#if !RAID6_USE_EMPTY_ZERO_PAGE -/* In .bss so it's zeroed */ -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); -#endif - /* * We maintain a biased count of active stripes in the bottom 16 bits of * bi_phys_segments, and a count of processed stripes in the upper 16 bits @@ -130,12 +128,42 @@ static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16); } +/* Find first data disk in a raid6 stripe */ +static inline int raid6_d0(struct stripe_head *sh) +{ + if (sh->ddf_layout) + /* ddf always start from first device */ + return 0; + /* md starts just after Q block */ + if (sh->qd_idx == sh->disks - 1) + return 0; + else + return sh->qd_idx + 1; +} static inline int raid6_next_disk(int disk, int raid_disks) { disk++; return (disk < raid_disks) ? disk : 0; } +/* When walking through the disks in a raid5, starting at raid6_d0, + * We need to map each disk to a 'slot', where the data disks are slot + * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk + * is raid_disks-1. This help does that mapping. + */ +static int raid6_idx_to_slot(int idx, struct stripe_head *sh, + int *count, int syndrome_disks) +{ + int slot; + + if (idx == sh->pd_idx) + return syndrome_disks; + if (idx == sh->qd_idx) + return syndrome_disks + 1; + slot = (*count)++; + return slot; +} + static void return_io(struct bio *return_bi) { struct bio *bi = return_bi; @@ -193,6 +221,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) } } } + static void release_stripe(struct stripe_head *sh) { raid5_conf_t *conf = sh->raid_conf; @@ -270,9 +299,11 @@ static int grow_buffers(struct stripe_head *sh, int num) return 0; } -static void raid5_build_block(struct stripe_head *sh, int i); +static void raid5_build_block(struct stripe_head *sh, int i, int previous); +static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, + struct stripe_head *sh); -static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks) +static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) { raid5_conf_t *conf = sh->raid_conf; int i; @@ -287,11 +318,12 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int remove_hash(sh); + sh->generation = conf->generation - previous; + sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; sh->sector = sector; - sh->pd_idx = pd_idx; + stripe_set_idx(sector, conf, previous, sh); sh->state = 0; - sh->disks = disks; for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -305,12 +337,13 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int BUG(); } dev->flags = 0; - raid5_build_block(sh, i); + raid5_build_block(sh, i, previous); } insert_hash(conf, sh); } -static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks) +static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, + short generation) { struct stripe_head *sh; struct hlist_node *hn; @@ -318,7 +351,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in CHECK_DEVLOCK(); pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) - if (sh->sector == sector && sh->disks == disks) + if (sh->sector == sector && sh->generation == generation) return sh; pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); return NULL; @@ -327,8 +360,9 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in static void unplug_slaves(mddev_t *mddev); static void raid5_unplug_device(struct request_queue *q); -static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks, - int pd_idx, int noblock) +static struct stripe_head * +get_active_stripe(raid5_conf_t *conf, sector_t sector, + int previous, int noblock) { struct stripe_head *sh; @@ -340,7 +374,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector wait_event_lock_irq(conf->wait_for_stripe, conf->quiesce == 0, conf->device_lock, /* nothing */); - sh = __find_stripe(conf, sector, disks); + sh = __find_stripe(conf, sector, conf->generation - previous); if (!sh) { if (!conf->inactive_blocked) sh = get_free_stripe(conf); @@ -358,10 +392,11 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector ); conf->inactive_blocked = 0; } else - init_stripe(sh, sector, pd_idx, disks); + init_stripe(sh, sector, previous); } else { if (atomic_read(&sh->count)) { - BUG_ON(!list_empty(&sh->lru)); + BUG_ON(!list_empty(&sh->lru) + && !test_bit(STRIPE_EXPANDING, &sh->state)); } else { if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); @@ -895,8 +930,10 @@ static int grow_stripes(raid5_conf_t *conf, int num) struct kmem_cache *sc; int devs = conf->raid_disks; - sprintf(conf->cache_name[0], "raid5-%s", mdname(conf->mddev)); - sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev)); + sprintf(conf->cache_name[0], + "raid%d-%s", conf->level, mdname(conf->mddev)); + sprintf(conf->cache_name[1], + "raid%d-%s-alt", conf->level, mdname(conf->mddev)); conf->active_name = 0; sc = kmem_cache_create(conf->cache_name[conf->active_name], sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), @@ -911,7 +948,6 @@ static int grow_stripes(raid5_conf_t *conf, int num) return 0; } -#ifdef CONFIG_MD_RAID5_RESHAPE static int resize_stripes(raid5_conf_t *conf, int newsize) { /* Make all the stripes able to hold 'newsize' devices. @@ -1036,7 +1072,6 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) conf->pool_size = newsize; return err; } -#endif static int drop_one_stripe(raid5_conf_t *conf) { @@ -1066,7 +1101,7 @@ static void shrink_stripes(raid5_conf_t *conf) static void raid5_end_read_request(struct bio * bi, int error) { - struct stripe_head *sh = bi->bi_private; + struct stripe_head *sh = bi->bi_private; raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks, i; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); @@ -1148,7 +1183,7 @@ static void raid5_end_read_request(struct bio * bi, int error) static void raid5_end_write_request(struct bio *bi, int error) { - struct stripe_head *sh = bi->bi_private; + struct stripe_head *sh = bi->bi_private; raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks, i; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); @@ -1176,9 +1211,9 @@ static void raid5_end_write_request(struct bio *bi, int error) } -static sector_t compute_blocknr(struct stripe_head *sh, int i); +static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); -static void raid5_build_block(struct stripe_head *sh, int i) +static void raid5_build_block(struct stripe_head *sh, int i, int previous) { struct r5dev *dev = &sh->dev[i]; @@ -1194,7 +1229,7 @@ static void raid5_build_block(struct stripe_head *sh, int i) dev->req.bi_private = sh; dev->flags = 0; - dev->sector = compute_blocknr(sh, i); + dev->sector = compute_blocknr(sh, i, previous); } static void error(mddev_t *mddev, mdk_rdev_t *rdev) @@ -1227,15 +1262,23 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * Input: a 'big' sector number, * Output: index of the data and parity disk, and the sector # in them. */ -static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, - unsigned int data_disks, unsigned int * dd_idx, - unsigned int * pd_idx, raid5_conf_t *conf) +static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, + int previous, int *dd_idx, + struct stripe_head *sh) { long stripe; unsigned long chunk_number; unsigned int chunk_offset; + int pd_idx, qd_idx; + int ddf_layout = 0; sector_t new_sector; - int sectors_per_chunk = conf->chunk_size >> 9; + int algorithm = previous ? conf->prev_algo + : conf->algorithm; + int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) + : (conf->chunk_size >> 9); + int raid_disks = previous ? conf->previous_raid_disks + : conf->raid_disks; + int data_disks = raid_disks - conf->max_degraded; /* First compute the information on this sector */ @@ -1259,68 +1302,170 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, /* * Select the parity disk based on the user selected algorithm. */ + pd_idx = qd_idx = ~0; switch(conf->level) { case 4: - *pd_idx = data_disks; + pd_idx = data_disks; break; case 5: - switch (conf->algorithm) { + switch (algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: - *pd_idx = data_disks - stripe % raid_disks; - if (*dd_idx >= *pd_idx) + pd_idx = data_disks - stripe % raid_disks; + if (*dd_idx >= pd_idx) (*dd_idx)++; break; case ALGORITHM_RIGHT_ASYMMETRIC: - *pd_idx = stripe % raid_disks; - if (*dd_idx >= *pd_idx) + pd_idx = stripe % raid_disks; + if (*dd_idx >= pd_idx) (*dd_idx)++; break; case ALGORITHM_LEFT_SYMMETRIC: - *pd_idx = data_disks - stripe % raid_disks; - *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; + pd_idx = data_disks - stripe % raid_disks; + *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; break; case ALGORITHM_RIGHT_SYMMETRIC: - *pd_idx = stripe % raid_disks; - *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; + pd_idx = stripe % raid_disks; + *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; + break; + case ALGORITHM_PARITY_0: + pd_idx = 0; + (*dd_idx)++; + break; + case ALGORITHM_PARITY_N: + pd_idx = data_disks; break; default: printk(KERN_ERR "raid5: unsupported algorithm %d\n", - conf->algorithm); + algorithm); + BUG(); } break; case 6: - /**** FIX THIS ****/ - switch (conf->algorithm) { + switch (algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: - *pd_idx = raid_disks - 1 - (stripe % raid_disks); - if (*pd_idx == raid_disks-1) - (*dd_idx)++; /* Q D D D P */ - else if (*dd_idx >= *pd_idx) + pd_idx = raid_disks - 1 - (stripe % raid_disks); + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) (*dd_idx) += 2; /* D D P Q D */ break; case ALGORITHM_RIGHT_ASYMMETRIC: - *pd_idx = stripe % raid_disks; - if (*pd_idx == raid_disks-1) - (*dd_idx)++; /* Q D D D P */ - else if (*dd_idx >= *pd_idx) + pd_idx = stripe % raid_disks; + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) (*dd_idx) += 2; /* D D P Q D */ break; case ALGORITHM_LEFT_SYMMETRIC: - *pd_idx = raid_disks - 1 - (stripe % raid_disks); - *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; + pd_idx = raid_disks - 1 - (stripe % raid_disks); + qd_idx = (pd_idx + 1) % raid_disks; + *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; break; case ALGORITHM_RIGHT_SYMMETRIC: - *pd_idx = stripe % raid_disks; - *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; + pd_idx = stripe % raid_disks; + qd_idx = (pd_idx + 1) % raid_disks; + *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; + break; + + case ALGORITHM_PARITY_0: + pd_idx = 0; + qd_idx = 1; + (*dd_idx) += 2; + break; + case ALGORITHM_PARITY_N: + pd_idx = data_disks; + qd_idx = data_disks + 1; break; + + case ALGORITHM_ROTATING_ZERO_RESTART: + /* Exactly the same as RIGHT_ASYMMETRIC, but or + * of blocks for computing Q is different. + */ + pd_idx = stripe % raid_disks; + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) + (*dd_idx) += 2; /* D D P Q D */ + ddf_layout = 1; + break; + + case ALGORITHM_ROTATING_N_RESTART: + /* Same a left_asymmetric, by first stripe is + * D D D P Q rather than + * Q D D D P + */ + pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks); + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) + (*dd_idx) += 2; /* D D P Q D */ + ddf_layout = 1; + break; + + case ALGORITHM_ROTATING_N_CONTINUE: + /* Same as left_symmetric but Q is before P */ + pd_idx = raid_disks - 1 - (stripe % raid_disks); + qd_idx = (pd_idx + raid_disks - 1) % raid_disks; + *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; + ddf_layout = 1; + break; + + case ALGORITHM_LEFT_ASYMMETRIC_6: + /* RAID5 left_asymmetric, with Q on last device */ + pd_idx = data_disks - stripe % (raid_disks-1); + if (*dd_idx >= pd_idx) + (*dd_idx)++; + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_RIGHT_ASYMMETRIC_6: + pd_idx = stripe % (raid_disks-1); + if (*dd_idx >= pd_idx) + (*dd_idx)++; + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_LEFT_SYMMETRIC_6: + pd_idx = data_disks - stripe % (raid_disks-1); + *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_RIGHT_SYMMETRIC_6: + pd_idx = stripe % (raid_disks-1); + *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_PARITY_0_6: + pd_idx = 0; + (*dd_idx)++; + qd_idx = raid_disks - 1; + break; + + default: printk(KERN_CRIT "raid6: unsupported algorithm %d\n", - conf->algorithm); + algorithm); + BUG(); } break; } + if (sh) { + sh->pd_idx = pd_idx; + sh->qd_idx = qd_idx; + sh->ddf_layout = ddf_layout; + } /* * Finally, compute the new sector number */ @@ -1329,17 +1474,21 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, } -static sector_t compute_blocknr(struct stripe_head *sh, int i) +static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) { raid5_conf_t *conf = sh->raid_conf; int raid_disks = sh->disks; int data_disks = raid_disks - conf->max_degraded; sector_t new_sector = sh->sector, check; - int sectors_per_chunk = conf->chunk_size >> 9; + int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) + : (conf->chunk_size >> 9); + int algorithm = previous ? conf->prev_algo + : conf->algorithm; sector_t stripe; int chunk_offset; - int chunk_number, dummy1, dummy2, dd_idx = i; + int chunk_number, dummy1, dd_idx = i; sector_t r_sector; + struct stripe_head sh2; chunk_offset = sector_div(new_sector, sectors_per_chunk); @@ -1351,7 +1500,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) switch(conf->level) { case 4: break; case 5: - switch (conf->algorithm) { + switch (algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: case ALGORITHM_RIGHT_ASYMMETRIC: if (i > sh->pd_idx) @@ -1363,19 +1512,27 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) i += raid_disks; i -= (sh->pd_idx + 1); break; + case ALGORITHM_PARITY_0: + i -= 1; + break; + case ALGORITHM_PARITY_N: + break; default: printk(KERN_ERR "raid5: unsupported algorithm %d\n", - conf->algorithm); + algorithm); + BUG(); } break; case 6: - if (i == raid6_next_disk(sh->pd_idx, raid_disks)) + if (i == sh->qd_idx) return 0; /* It is the Q disk */ - switch (conf->algorithm) { + switch (algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: case ALGORITHM_RIGHT_ASYMMETRIC: - if (sh->pd_idx == raid_disks-1) - i--; /* Q D D D P */ + case ALGORITHM_ROTATING_ZERO_RESTART: + case ALGORITHM_ROTATING_N_RESTART: + if (sh->pd_idx == raid_disks-1) + i--; /* Q D D D P */ else if (i > sh->pd_idx) i -= 2; /* D D P Q D */ break; @@ -1390,9 +1547,35 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) i -= (sh->pd_idx + 2); } break; + case ALGORITHM_PARITY_0: + i -= 2; + break; + case ALGORITHM_PARITY_N: + break; + case ALGORITHM_ROTATING_N_CONTINUE: + if (sh->pd_idx == 0) + i--; /* P D D D Q */ + else if (i > sh->pd_idx) + i -= 2; /* D D Q P D */ + break; + case ALGORITHM_LEFT_ASYMMETRIC_6: + case ALGORITHM_RIGHT_ASYMMETRIC_6: + if (i > sh->pd_idx) + i--; + break; + case ALGORITHM_LEFT_SYMMETRIC_6: + case ALGORITHM_RIGHT_SYMMETRIC_6: + if (i < sh->pd_idx) + i += data_disks + 1; + i -= (sh->pd_idx + 1); + break; + case ALGORITHM_PARITY_0_6: + i -= 1; + break; default: printk(KERN_CRIT "raid6: unsupported algorithm %d\n", - conf->algorithm); + algorithm); + BUG(); } break; } @@ -1400,8 +1583,10 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) chunk_number = stripe * data_disks + i; r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; - check = raid5_compute_sector(r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); - if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { + check = raid5_compute_sector(conf, r_sector, + previous, &dummy1, &sh2); + if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx + || sh2.qd_idx != sh->qd_idx) { printk(KERN_ERR "compute_blocknr: map not correct\n"); return 0; } @@ -1468,14 +1653,16 @@ static void copy_data(int frombio, struct bio *bio, static void compute_parity6(struct stripe_head *sh, int method) { - raid6_conf_t *conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count; + raid5_conf_t *conf = sh->raid_conf; + int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count; + int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); struct bio *chosen; /**** FIX THIS: This could be very bad if disks is close to 256 ****/ - void *ptrs[disks]; + void *ptrs[syndrome_disks+2]; - qd_idx = raid6_next_disk(pd_idx, disks); - d0_idx = raid6_next_disk(qd_idx, disks); + pd_idx = sh->pd_idx; + qd_idx = sh->qd_idx; + d0_idx = raid6_d0(sh); pr_debug("compute_parity, stripe %llu, method %d\n", (unsigned long long)sh->sector, method); @@ -1513,24 +1700,29 @@ static void compute_parity6(struct stripe_head *sh, int method) set_bit(R5_UPTODATE, &sh->dev[i].flags); } -// switch(method) { -// case RECONSTRUCT_WRITE: -// case CHECK_PARITY: -// case UPDATE_PARITY: - /* Note that unlike RAID-5, the ordering of the disks matters greatly. */ - /* FIX: Is this ordering of drives even remotely optimal? */ - count = 0; - i = d0_idx; - do { - ptrs[count++] = page_address(sh->dev[i].page); - if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags)) - printk("block %d/%d not uptodate on parity calc\n", i,count); - i = raid6_next_disk(i, disks); - } while ( i != d0_idx ); -// break; -// } - - raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs); + /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/ + + for (i = 0; i < disks; i++) + ptrs[i] = (void *)raid6_empty_zero_page; + + count = 0; + i = d0_idx; + do { + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + + ptrs[slot] = page_address(sh->dev[i].page); + if (slot < syndrome_disks && + !test_bit(R5_UPTODATE, &sh->dev[i].flags)) { + printk(KERN_ERR "block %d/%d not uptodate " + "on parity calc\n", i, count); + BUG(); + } + + i = raid6_next_disk(i, disks); + } while (i != d0_idx); + BUG_ON(count != syndrome_disks); + + raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs); switch(method) { case RECONSTRUCT_WRITE: @@ -1552,8 +1744,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) { int i, count, disks = sh->disks; void *ptr[MAX_XOR_BLOCKS], *dest, *p; - int pd_idx = sh->pd_idx; - int qd_idx = raid6_next_disk(pd_idx, disks); + int qd_idx = sh->qd_idx; pr_debug("compute_block_1, stripe %llu, idx %d\n", (unsigned long long)sh->sector, dd_idx); @@ -1589,63 +1780,65 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) { int i, count, disks = sh->disks; - int pd_idx = sh->pd_idx; - int qd_idx = raid6_next_disk(pd_idx, disks); - int d0_idx = raid6_next_disk(qd_idx, disks); - int faila, failb; + int syndrome_disks = sh->ddf_layout ? disks : disks-2; + int d0_idx = raid6_d0(sh); + int faila = -1, failb = -1; + /**** FIX THIS: This could be very bad if disks is close to 256 ****/ + void *ptrs[syndrome_disks+2]; - /* faila and failb are disk numbers relative to d0_idx */ - /* pd_idx become disks-2 and qd_idx become disks-1 */ - faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx; - failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx; + for (i = 0; i < disks ; i++) + ptrs[i] = (void *)raid6_empty_zero_page; + count = 0; + i = d0_idx; + do { + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + + ptrs[slot] = page_address(sh->dev[i].page); + + if (i == dd_idx1) + faila = slot; + if (i == dd_idx2) + failb = slot; + i = raid6_next_disk(i, disks); + } while (i != d0_idx); + BUG_ON(count != syndrome_disks); BUG_ON(faila == failb); if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", - (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb); + (unsigned long long)sh->sector, dd_idx1, dd_idx2, + faila, failb); - if ( failb == disks-1 ) { + if (failb == syndrome_disks+1) { /* Q disk is one of the missing disks */ - if ( faila == disks-2 ) { + if (faila == syndrome_disks) { /* Missing P+Q, just recompute */ compute_parity6(sh, UPDATE_PARITY); return; } else { /* We're missing D+Q; recompute D from P */ - compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0); + compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ? + dd_idx2 : dd_idx1), + 0); compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */ return; } } - /* We're missing D+P or D+D; build pointer table */ - { - /**** FIX THIS: This could be very bad if disks is close to 256 ****/ - void *ptrs[disks]; - - count = 0; - i = d0_idx; - do { - ptrs[count++] = page_address(sh->dev[i].page); - i = raid6_next_disk(i, disks); - if (i != dd_idx1 && i != dd_idx2 && - !test_bit(R5_UPTODATE, &sh->dev[i].flags)) - printk("compute_2 with missing block %d/%d\n", count, i); - } while ( i != d0_idx ); - - if ( failb == disks-2 ) { - /* We're missing D+P. */ - raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs); - } else { - /* We're missing D+D. */ - raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs); - } - - /* Both the above update both missing blocks */ - set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); - set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); + /* We're missing D+P or D+D; */ + if (failb == syndrome_disks) { + /* We're missing D+P. */ + raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs); + } else { + /* We're missing D+D. */ + raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb, + ptrs); } + + /* Both the above update both missing blocks */ + set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); + set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); } static void @@ -1800,17 +1993,21 @@ static int page_is_zero(struct page *p) memcmp(a, a+4, STRIPE_SIZE-4)==0); } -static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) +static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, + struct stripe_head *sh) { - int sectors_per_chunk = conf->chunk_size >> 9; - int pd_idx, dd_idx; + int sectors_per_chunk = + previous ? (conf->prev_chunk >> 9) + : (conf->chunk_size >> 9); + int dd_idx; int chunk_offset = sector_div(stripe, sectors_per_chunk); + int disks = previous ? conf->previous_raid_disks : conf->raid_disks; - raid5_compute_sector(stripe * (disks - conf->max_degraded) + raid5_compute_sector(conf, + stripe * (disks - conf->max_degraded) *sectors_per_chunk + chunk_offset, - disks, disks - conf->max_degraded, - &dd_idx, &pd_idx, conf); - return pd_idx; + previous, + &dd_idx, sh); } static void @@ -2181,7 +2378,7 @@ static void handle_stripe_dirtying6(raid5_conf_t *conf, struct r6_state *r6s, int disks) { int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; - int qd_idx = r6s->qd_idx; + int qd_idx = sh->qd_idx; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* Would I have to read this buffer for reconstruct_write */ @@ -2371,7 +2568,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, int update_p = 0, update_q = 0; struct r5dev *dev; int pd_idx = sh->pd_idx; - int qd_idx = r6s->qd_idx; + int qd_idx = sh->qd_idx; set_bit(STRIPE_HANDLE, &sh->state); @@ -2467,17 +2664,14 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, struct dma_async_tx_descriptor *tx = NULL; clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); for (i = 0; i < sh->disks; i++) - if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) { - int dd_idx, pd_idx, j; + if (i != sh->pd_idx && i != sh->qd_idx) { + int dd_idx, j; struct stripe_head *sh2; - sector_t bn = compute_blocknr(sh, i); - sector_t s = raid5_compute_sector(bn, conf->raid_disks, - conf->raid_disks - - conf->max_degraded, &dd_idx, - &pd_idx, conf); - sh2 = get_active_stripe(conf, s, conf->raid_disks, - pd_idx, 1); + sector_t bn = compute_blocknr(sh, i, 1); + sector_t s = raid5_compute_sector(conf, bn, 0, + &dd_idx, NULL); + sh2 = get_active_stripe(conf, s, 0, 1); if (sh2 == NULL) /* so far only the early blocks of this stripe * have been requested. When later blocks @@ -2500,8 +2694,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); for (j = 0; j < conf->raid_disks; j++) if (j != sh2->pd_idx && - (!r6s || j != raid6_next_disk(sh2->pd_idx, - sh2->disks)) && + (!r6s || j != sh2->qd_idx) && !test_bit(R5_Expanded, &sh2->dev[j].flags)) break; if (j == conf->raid_disks) { @@ -2750,6 +2943,23 @@ static bool handle_stripe5(struct stripe_head *sh) /* Finish reconstruct operations initiated by the expansion process */ if (sh->reconstruct_state == reconstruct_state_result) { + struct stripe_head *sh2 + = get_active_stripe(conf, sh->sector, 1, 1); + if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { + /* sh cannot be written until sh2 has been read. + * so arrange for sh to be delayed a little + */ + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, + &sh2->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe(sh2); + goto unlock; + } + if (sh2) + release_stripe(sh2); + sh->reconstruct_state = reconstruct_state_idle; clear_bit(STRIPE_EXPANDING, &sh->state); for (i = conf->raid_disks; i--; ) { @@ -2763,8 +2973,7 @@ static bool handle_stripe5(struct stripe_head *sh) !sh->reconstruct_state) { /* Need to write out all blocks after computing parity */ sh->disks = conf->raid_disks; - sh->pd_idx = stripe_to_pdidx(sh->sector, conf, - conf->raid_disks); + stripe_set_idx(sh->sector, conf, 0, sh); schedule_reconstruction5(sh, &s, 1, 1); } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { clear_bit(STRIPE_EXPAND_READY, &sh->state); @@ -2796,20 +3005,19 @@ static bool handle_stripe5(struct stripe_head *sh) static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) { - raid6_conf_t *conf = sh->raid_conf; + raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks; struct bio *return_bi = NULL; - int i, pd_idx = sh->pd_idx; + int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx; struct stripe_head_state s; struct r6_state r6s; struct r5dev *dev, *pdev, *qdev; mdk_rdev_t *blocked_rdev = NULL; - r6s.qd_idx = raid6_next_disk(pd_idx, disks); pr_debug("handling stripe %llu, state=%#lx cnt=%d, " "pd_idx=%d, qd_idx=%d\n", (unsigned long long)sh->sector, sh->state, - atomic_read(&sh->count), pd_idx, r6s.qd_idx); + atomic_read(&sh->count), pd_idx, qd_idx); memset(&s, 0, sizeof(s)); spin_lock(&sh->lock); @@ -2920,9 +3128,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) pdev = &sh->dev[pd_idx]; r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); - qdev = &sh->dev[r6s.qd_idx]; - r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx) - || (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx); + qdev = &sh->dev[qd_idx]; + r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx) + || (s.failed >= 2 && r6s.failed_num[1] == qd_idx); if ( s.written && ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) @@ -2980,10 +3188,26 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) } if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { + struct stripe_head *sh2 + = get_active_stripe(conf, sh->sector, 1, 1); + if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { + /* sh cannot be written until sh2 has been read. + * so arrange for sh to be delayed a little + */ + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, + &sh2->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe(sh2); + goto unlock; + } + if (sh2) + release_stripe(sh2); + /* Need to write out all blocks after computing P&Q */ sh->disks = conf->raid_disks; - sh->pd_idx = stripe_to_pdidx(sh->sector, conf, - conf->raid_disks); + stripe_set_idx(sh->sector, conf, 0, sh); compute_parity6(sh, RECONSTRUCT_WRITE); for (i = conf->raid_disks ; i-- ; ) { set_bit(R5_LOCKED, &sh->dev[i].flags); @@ -3134,6 +3358,8 @@ static int raid5_mergeable_bvec(struct request_queue *q, if ((bvm->bi_rw & 1) == WRITE) return biovec->bv_len; /* always allow writes to be mergeable */ + if (mddev->new_chunk < mddev->chunk_size) + chunk_sectors = mddev->new_chunk >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; if (max <= biovec->bv_len && bio_sectors == 0) @@ -3149,6 +3375,8 @@ static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) unsigned int chunk_sectors = mddev->chunk_size >> 9; unsigned int bio_sectors = bio->bi_size >> 9; + if (mddev->new_chunk < mddev->chunk_size) + chunk_sectors = mddev->new_chunk >> 9; return chunk_sectors >= ((sector & (chunk_sectors - 1)) + bio_sectors); } @@ -3255,9 +3483,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) { mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev_to_conf(mddev); - const unsigned int raid_disks = conf->raid_disks; - const unsigned int data_disks = raid_disks - conf->max_degraded; - unsigned int dd_idx, pd_idx; + unsigned int dd_idx; struct bio* align_bi; mdk_rdev_t *rdev; @@ -3266,7 +3492,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) return 0; } /* - * use bio_clone to make a copy of the bio + * use bio_clone to make a copy of the bio */ align_bi = bio_clone(raid_bio, GFP_NOIO); if (!align_bi) @@ -3280,12 +3506,9 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) /* * compute position */ - align_bi->bi_sector = raid5_compute_sector(raid_bio->bi_sector, - raid_disks, - data_disks, - &dd_idx, - &pd_idx, - conf); + align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, + 0, + &dd_idx, NULL); rcu_read_lock(); rdev = rcu_dereference(conf->disks[dd_idx].rdev); @@ -3377,7 +3600,7 @@ static int make_request(struct request_queue *q, struct bio * bi) { mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev_to_conf(mddev); - unsigned int dd_idx, pd_idx; + int dd_idx; sector_t new_sector; sector_t logical_sector, last_sector; struct stripe_head *sh; @@ -3400,7 +3623,7 @@ static int make_request(struct request_queue *q, struct bio * bi) if (rw == READ && mddev->reshape_position == MaxSector && chunk_aligned_read(q,bi)) - return 0; + return 0; logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); last_sector = bi->bi_sector + (bi->bi_size>>9); @@ -3410,26 +3633,31 @@ static int make_request(struct request_queue *q, struct bio * bi) for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { DEFINE_WAIT(w); int disks, data_disks; + int previous; retry: + previous = 0; + disks = conf->raid_disks; prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - if (likely(conf->expand_progress == MaxSector)) - disks = conf->raid_disks; - else { - /* spinlock is needed as expand_progress may be + if (unlikely(conf->reshape_progress != MaxSector)) { + /* spinlock is needed as reshape_progress may be * 64bit on a 32bit platform, and so it might be * possible to see a half-updated value - * Ofcourse expand_progress could change after + * Ofcourse reshape_progress could change after * the lock is dropped, so once we get a reference * to the stripe that we think it is, we will have * to check again. */ spin_lock_irq(&conf->device_lock); - disks = conf->raid_disks; - if (logical_sector >= conf->expand_progress) + if (mddev->delta_disks < 0 + ? logical_sector < conf->reshape_progress + : logical_sector >= conf->reshape_progress) { disks = conf->previous_raid_disks; - else { - if (logical_sector >= conf->expand_lo) { + previous = 1; + } else { + if (mddev->delta_disks < 0 + ? logical_sector < conf->reshape_safe + : logical_sector >= conf->reshape_safe) { spin_unlock_irq(&conf->device_lock); schedule(); goto retry; @@ -3439,15 +3667,17 @@ static int make_request(struct request_queue *q, struct bio * bi) } data_disks = disks - conf->max_degraded; - new_sector = raid5_compute_sector(logical_sector, disks, data_disks, - &dd_idx, &pd_idx, conf); + new_sector = raid5_compute_sector(conf, logical_sector, + previous, + &dd_idx, NULL); pr_debug("raid5: make_request, sector %llu logical %llu\n", (unsigned long long)new_sector, (unsigned long long)logical_sector); - sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK)); + sh = get_active_stripe(conf, new_sector, previous, + (bi->bi_rw&RWA_MASK)); if (sh) { - if (unlikely(conf->expand_progress != MaxSector)) { + if (unlikely(previous)) { /* expansion might have moved on while waiting for a * stripe, so we must do the range check again. * Expansion could still move past after this @@ -3458,8 +3688,9 @@ static int make_request(struct request_queue *q, struct bio * bi) */ int must_retry = 0; spin_lock_irq(&conf->device_lock); - if (logical_sector < conf->expand_progress && - disks == conf->previous_raid_disks) + if (mddev->delta_disks < 0 + ? logical_sector >= conf->reshape_progress + : logical_sector < conf->reshape_progress) /* mismatch, need to try again */ must_retry = 1; spin_unlock_irq(&conf->device_lock); @@ -3514,6 +3745,8 @@ static int make_request(struct request_queue *q, struct bio * bi) return 0; } +static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks); + static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped) { /* reshaping is quite different to recovery/resync so it is @@ -3527,61 +3760,118 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped */ raid5_conf_t *conf = (raid5_conf_t *) mddev->private; struct stripe_head *sh; - int pd_idx; sector_t first_sector, last_sector; int raid_disks = conf->previous_raid_disks; int data_disks = raid_disks - conf->max_degraded; int new_data_disks = conf->raid_disks - conf->max_degraded; int i; int dd_idx; - sector_t writepos, safepos, gap; - - if (sector_nr == 0 && - conf->expand_progress != 0) { - /* restarting in the middle, skip the initial sectors */ - sector_nr = conf->expand_progress; + sector_t writepos, readpos, safepos; + sector_t stripe_addr; + int reshape_sectors; + struct list_head stripes; + + if (sector_nr == 0) { + /* If restarting in the middle, skip the initial sectors */ + if (mddev->delta_disks < 0 && + conf->reshape_progress < raid5_size(mddev, 0, 0)) { + sector_nr = raid5_size(mddev, 0, 0) + - conf->reshape_progress; + } else if (mddev->delta_disks > 0 && + conf->reshape_progress > 0) + sector_nr = conf->reshape_progress; sector_div(sector_nr, new_data_disks); - *skipped = 1; - return sector_nr; + if (sector_nr) { + *skipped = 1; + return sector_nr; + } } + /* We need to process a full chunk at a time. + * If old and new chunk sizes differ, we need to process the + * largest of these + */ + if (mddev->new_chunk > mddev->chunk_size) + reshape_sectors = mddev->new_chunk / 512; + else + reshape_sectors = mddev->chunk_size / 512; + /* we update the metadata when there is more than 3Meg * in the block range (that is rather arbitrary, should * probably be time based) or when the data about to be * copied would over-write the source of the data at * the front of the range. - * i.e. one new_stripe forward from expand_progress new_maps - * to after where expand_lo old_maps to + * i.e. one new_stripe along from reshape_progress new_maps + * to after where reshape_safe old_maps to */ - writepos = conf->expand_progress + - conf->chunk_size/512*(new_data_disks); + writepos = conf->reshape_progress; sector_div(writepos, new_data_disks); - safepos = conf->expand_lo; + readpos = conf->reshape_progress; + sector_div(readpos, data_disks); + safepos = conf->reshape_safe; sector_div(safepos, data_disks); - gap = conf->expand_progress - conf->expand_lo; + if (mddev->delta_disks < 0) { + writepos -= reshape_sectors; + readpos += reshape_sectors; + safepos += reshape_sectors; + } else { + writepos += reshape_sectors; + readpos -= reshape_sectors; + safepos -= reshape_sectors; + } - if (writepos >= safepos || - gap > (new_data_disks)*3000*2 /*3Meg*/) { + /* 'writepos' is the most advanced device address we might write. + * 'readpos' is the least advanced device address we might read. + * 'safepos' is the least address recorded in the metadata as having + * been reshaped. + * If 'readpos' is behind 'writepos', then there is no way that we can + * ensure safety in the face of a crash - that must be done by userspace + * making a backup of the data. So in that case there is no particular + * rush to update metadata. + * Otherwise if 'safepos' is behind 'writepos', then we really need to + * update the metadata to advance 'safepos' to match 'readpos' so that + * we can be safe in the event of a crash. + * So we insist on updating metadata if safepos is behind writepos and + * readpos is beyond writepos. + * In any case, update the metadata every 10 seconds. + * Maybe that number should be configurable, but I'm not sure it is + * worth it.... maybe it could be a multiple of safemode_delay??? + */ + if ((mddev->delta_disks < 0 + ? (safepos > writepos && readpos < writepos) + : (safepos < writepos && readpos > writepos)) || + time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes)==0); - mddev->reshape_position = conf->expand_progress; + mddev->reshape_position = conf->reshape_progress; + conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, mddev->flags == 0 || kthread_should_stop()); spin_lock_irq(&conf->device_lock); - conf->expand_lo = mddev->reshape_position; + conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); } - for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { + if (mddev->delta_disks < 0) { + BUG_ON(conf->reshape_progress == 0); + stripe_addr = writepos; + BUG_ON((mddev->dev_sectors & + ~((sector_t)reshape_sectors - 1)) + - reshape_sectors - stripe_addr + != sector_nr); + } else { + BUG_ON(writepos != sector_nr + reshape_sectors); + stripe_addr = sector_nr; + } + INIT_LIST_HEAD(&stripes); + for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { int j; int skipped = 0; - pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks); - sh = get_active_stripe(conf, sector_nr+i, - conf->raid_disks, pd_idx, 0); + sh = get_active_stripe(conf, stripe_addr+i, 0, 0); set_bit(STRIPE_EXPANDING, &sh->state); atomic_inc(&conf->reshape_stripes); /* If any of this stripe is beyond the end of the old @@ -3592,10 +3882,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped if (j == sh->pd_idx) continue; if (conf->level == 6 && - j == raid6_next_disk(sh->pd_idx, sh->disks)) + j == sh->qd_idx) continue; - s = compute_blocknr(sh, j); - if (s < mddev->array_sectors) { + s = compute_blocknr(sh, j, 0); + if (s < raid5_size(mddev, 0, 0)) { skipped = 1; continue; } @@ -3607,10 +3897,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped set_bit(STRIPE_EXPAND_READY, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); } - release_stripe(sh); + list_add(&sh->lru, &stripes); } spin_lock_irq(&conf->device_lock); - conf->expand_progress = (sector_nr + i) * new_data_disks; + if (mddev->delta_disks < 0) + conf->reshape_progress -= reshape_sectors * new_data_disks; + else + conf->reshape_progress += reshape_sectors * new_data_disks; spin_unlock_irq(&conf->device_lock); /* Ok, those stripe are ready. We can start scheduling * reads on the source stripes. @@ -3618,46 +3911,50 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped * block on the destination stripes. */ first_sector = - raid5_compute_sector(sector_nr*(new_data_disks), - raid_disks, data_disks, - &dd_idx, &pd_idx, conf); + raid5_compute_sector(conf, stripe_addr*(new_data_disks), + 1, &dd_idx, NULL); last_sector = - raid5_compute_sector((sector_nr+conf->chunk_size/512) - *(new_data_disks) -1, - raid_disks, data_disks, - &dd_idx, &pd_idx, conf); - if (last_sector >= (mddev->size<<1)) - last_sector = (mddev->size<<1)-1; + raid5_compute_sector(conf, ((stripe_addr+conf->chunk_size/512) + *(new_data_disks) - 1), + 1, &dd_idx, NULL); + if (last_sector >= mddev->dev_sectors) + last_sector = mddev->dev_sectors - 1; while (first_sector <= last_sector) { - pd_idx = stripe_to_pdidx(first_sector, conf, - conf->previous_raid_disks); - sh = get_active_stripe(conf, first_sector, - conf->previous_raid_disks, pd_idx, 0); + sh = get_active_stripe(conf, first_sector, 1, 0); set_bit(STRIPE_EXPAND_SOURCE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); first_sector += STRIPE_SECTORS; } + /* Now that the sources are clearly marked, we can release + * the destination stripes + */ + while (!list_empty(&stripes)) { + sh = list_entry(stripes.next, struct stripe_head, lru); + list_del_init(&sh->lru); + release_stripe(sh); + } /* If this takes us to the resync_max point where we have to pause, * then we need to write out the superblock. */ - sector_nr += conf->chunk_size>>9; + sector_nr += reshape_sectors; if (sector_nr >= mddev->resync_max) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes) == 0); - mddev->reshape_position = conf->expand_progress; + mddev->reshape_position = conf->reshape_progress; + conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_DEVS, &mddev->flags) || kthread_should_stop()); spin_lock_irq(&conf->device_lock); - conf->expand_lo = mddev->reshape_position; + conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); } - return conf->chunk_size>>9; + return reshape_sectors; } /* FIXME go_faster isn't used */ @@ -3665,9 +3962,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski { raid5_conf_t *conf = (raid5_conf_t *) mddev->private; struct stripe_head *sh; - int pd_idx; - int raid_disks = conf->raid_disks; - sector_t max_sector = mddev->size << 1; + sector_t max_sector = mddev->dev_sectors; int sync_blocks; int still_degraded = 0; int i; @@ -3675,6 +3970,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski if (sector_nr >= max_sector) { /* just being told to finish up .. nothing much to do */ unplug_slaves(mddev); + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { end_reshape(conf); return 0; @@ -3705,7 +4001,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski */ if (mddev->degraded >= conf->max_degraded && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - sector_t rv = (mddev->size << 1) - sector_nr; + sector_t rv = mddev->dev_sectors - sector_nr; *skipped = 1; return rv; } @@ -3721,10 +4017,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski bitmap_cond_end_sync(mddev->bitmap, sector_nr); - pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks); - sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1); + sh = get_active_stripe(conf, sector_nr, 0, 1); if (sh == NULL) { - sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0); + sh = get_active_stripe(conf, sector_nr, 0, 0); /* make sure we don't swamp the stripe cache if someone else * is trying to get access */ @@ -3766,19 +4061,15 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. */ struct stripe_head *sh; - int dd_idx, pd_idx; + int dd_idx; sector_t sector, logical_sector, last_sector; int scnt = 0; int remaining; int handled = 0; logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); - sector = raid5_compute_sector( logical_sector, - conf->raid_disks, - conf->raid_disks - conf->max_degraded, - &dd_idx, - &pd_idx, - conf); + sector = raid5_compute_sector(conf, logical_sector, + 0, &dd_idx, NULL); last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); for (; logical_sector < last_sector; @@ -3790,7 +4081,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) /* already done this stripe */ continue; - sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1); + sh = get_active_stripe(conf, sector, 0, 1); if (!sh) { /* failed to get a stripe - must wait */ @@ -3992,89 +4283,69 @@ static struct attribute_group raid5_attrs_group = { .attrs = raid5_attrs, }; -static int run(mddev_t *mddev) +static sector_t +raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + + if (!sectors) + sectors = mddev->dev_sectors; + if (!raid_disks) { + /* size is defined by the smallest of previous and new size */ + if (conf->raid_disks < conf->previous_raid_disks) + raid_disks = conf->raid_disks; + else + raid_disks = conf->previous_raid_disks; + } + + sectors &= ~((sector_t)mddev->chunk_size/512 - 1); + sectors &= ~((sector_t)mddev->new_chunk/512 - 1); + return sectors * (raid_disks - conf->max_degraded); +} + +static raid5_conf_t *setup_conf(mddev_t *mddev) { raid5_conf_t *conf; int raid_disk, memory; mdk_rdev_t *rdev; struct disk_info *disk; - int working_disks = 0; - if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) { + if (mddev->new_level != 5 + && mddev->new_level != 4 + && mddev->new_level != 6) { printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", - mdname(mddev), mddev->level); - return -EIO; + mdname(mddev), mddev->new_level); + return ERR_PTR(-EIO); } - - if (mddev->chunk_size < PAGE_SIZE) { - printk(KERN_ERR "md/raid5: chunk_size must be at least " - "PAGE_SIZE but %d < %ld\n", - mddev->chunk_size, PAGE_SIZE); - return -EINVAL; + if ((mddev->new_level == 5 + && !algorithm_valid_raid5(mddev->new_layout)) || + (mddev->new_level == 6 + && !algorithm_valid_raid6(mddev->new_layout))) { + printk(KERN_ERR "raid5: %s: layout %d not supported\n", + mdname(mddev), mddev->new_layout); + return ERR_PTR(-EIO); } - - if (mddev->reshape_position != MaxSector) { - /* Check that we can continue the reshape. - * Currently only disks can change, it must - * increase, and we must be past the point where - * a stripe over-writes itself - */ - sector_t here_new, here_old; - int old_disks; - int max_degraded = (mddev->level == 5 ? 1 : 2); - - if (mddev->new_level != mddev->level || - mddev->new_layout != mddev->layout || - mddev->new_chunk != mddev->chunk_size) { - printk(KERN_ERR "raid5: %s: unsupported reshape " - "required - aborting.\n", - mdname(mddev)); - return -EINVAL; - } - if (mddev->delta_disks <= 0) { - printk(KERN_ERR "raid5: %s: unsupported reshape " - "(reduce disks) required - aborting.\n", - mdname(mddev)); - return -EINVAL; - } - old_disks = mddev->raid_disks - mddev->delta_disks; - /* reshape_position must be on a new-stripe boundary, and one - * further up in new geometry must map after here in old - * geometry. - */ - here_new = mddev->reshape_position; - if (sector_div(here_new, (mddev->chunk_size>>9)* - (mddev->raid_disks - max_degraded))) { - printk(KERN_ERR "raid5: reshape_position not " - "on a stripe boundary\n"); - return -EINVAL; - } - /* here_new is the stripe we will write to */ - here_old = mddev->reshape_position; - sector_div(here_old, (mddev->chunk_size>>9)* - (old_disks-max_degraded)); - /* here_old is the first stripe that we might need to read - * from */ - if (here_new >= here_old) { - /* Reading from the same stripe as writing to - bad */ - printk(KERN_ERR "raid5: reshape_position too early for " - "auto-recovery - aborting.\n"); - return -EINVAL; - } - printk(KERN_INFO "raid5: reshape will continue\n"); - /* OK, we should be able to continue; */ + if (mddev->new_level == 6 && mddev->raid_disks < 4) { + printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", + mdname(mddev), mddev->raid_disks); + return ERR_PTR(-EINVAL); } + if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) { + printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", + mddev->new_chunk, mdname(mddev)); + return ERR_PTR(-EINVAL); + } - mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL); - if ((conf = mddev->private) == NULL) + conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL); + if (conf == NULL) goto abort; - if (mddev->reshape_position == MaxSector) { - conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks; - } else { - conf->raid_disks = mddev->raid_disks; + + conf->raid_disks = mddev->raid_disks; + if (mddev->reshape_position == MaxSector) + conf->previous_raid_disks = mddev->raid_disks; + else conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; - } conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), GFP_KERNEL); @@ -4086,13 +4357,12 @@ static int run(mddev_t *mddev) if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) goto abort; - if (mddev->level == 6) { + if (mddev->new_level == 6) { conf->spare_page = alloc_page(GFP_KERNEL); if (!conf->spare_page) goto abort; } spin_lock_init(&conf->device_lock); - mddev->queue->queue_lock = &conf->device_lock; init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); @@ -4121,47 +4391,134 @@ static int run(mddev_t *mddev) printk(KERN_INFO "raid5: device %s operational as raid" " disk %d\n", bdevname(rdev->bdev,b), raid_disk); - working_disks++; } else /* Cannot rely on bitmap to complete recovery */ conf->fullsync = 1; } - /* - * 0 for a fully functional array, 1 or 2 for a degraded array. - */ - mddev->degraded = conf->raid_disks - working_disks; - conf->mddev = mddev; - conf->chunk_size = mddev->chunk_size; - conf->level = mddev->level; + conf->chunk_size = mddev->new_chunk; + conf->level = mddev->new_level; if (conf->level == 6) conf->max_degraded = 2; else conf->max_degraded = 1; - conf->algorithm = mddev->layout; + conf->algorithm = mddev->new_layout; conf->max_nr_stripes = NR_STRIPES; - conf->expand_progress = mddev->reshape_position; - - /* device size must be a multiple of chunk size */ - mddev->size &= ~(mddev->chunk_size/1024 -1); - mddev->resync_max_sectors = mddev->size << 1; + conf->reshape_progress = mddev->reshape_position; + if (conf->reshape_progress != MaxSector) { + conf->prev_chunk = mddev->chunk_size; + conf->prev_algo = mddev->layout; + } - if (conf->level == 6 && conf->raid_disks < 4) { - printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", - mdname(mddev), conf->raid_disks); + memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + + conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; + if (grow_stripes(conf, conf->max_nr_stripes)) { + printk(KERN_ERR + "raid5: couldn't allocate %dkB for buffers\n", memory); goto abort; - } - if (!conf->chunk_size || conf->chunk_size % 4) { - printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", - conf->chunk_size, mdname(mddev)); + } else + printk(KERN_INFO "raid5: allocated %dkB for %s\n", + memory, mdname(mddev)); + + conf->thread = md_register_thread(raid5d, mddev, "%s_raid5"); + if (!conf->thread) { + printk(KERN_ERR + "raid5: couldn't allocate thread for %s\n", + mdname(mddev)); goto abort; } - if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { - printk(KERN_ERR - "raid5: unsupported parity algorithm %d for %s\n", - conf->algorithm, mdname(mddev)); - goto abort; + + return conf; + + abort: + if (conf) { + shrink_stripes(conf); + safe_put_page(conf->spare_page); + kfree(conf->disks); + kfree(conf->stripe_hashtbl); + kfree(conf); + return ERR_PTR(-EIO); + } else + return ERR_PTR(-ENOMEM); +} + +static int run(mddev_t *mddev) +{ + raid5_conf_t *conf; + int working_disks = 0; + mdk_rdev_t *rdev; + + if (mddev->reshape_position != MaxSector) { + /* Check that we can continue the reshape. + * Currently only disks can change, it must + * increase, and we must be past the point where + * a stripe over-writes itself + */ + sector_t here_new, here_old; + int old_disks; + int max_degraded = (mddev->level == 6 ? 2 : 1); + + if (mddev->new_level != mddev->level) { + printk(KERN_ERR "raid5: %s: unsupported reshape " + "required - aborting.\n", + mdname(mddev)); + return -EINVAL; + } + old_disks = mddev->raid_disks - mddev->delta_disks; + /* reshape_position must be on a new-stripe boundary, and one + * further up in new geometry must map after here in old + * geometry. + */ + here_new = mddev->reshape_position; + if (sector_div(here_new, (mddev->new_chunk>>9)* + (mddev->raid_disks - max_degraded))) { + printk(KERN_ERR "raid5: reshape_position not " + "on a stripe boundary\n"); + return -EINVAL; + } + /* here_new is the stripe we will write to */ + here_old = mddev->reshape_position; + sector_div(here_old, (mddev->chunk_size>>9)* + (old_disks-max_degraded)); + /* here_old is the first stripe that we might need to read + * from */ + if (here_new >= here_old) { + /* Reading from the same stripe as writing to - bad */ + printk(KERN_ERR "raid5: reshape_position too early for " + "auto-recovery - aborting.\n"); + return -EINVAL; + } + printk(KERN_INFO "raid5: reshape will continue\n"); + /* OK, we should be able to continue; */ + } else { + BUG_ON(mddev->level != mddev->new_level); + BUG_ON(mddev->layout != mddev->new_layout); + BUG_ON(mddev->chunk_size != mddev->new_chunk); + BUG_ON(mddev->delta_disks != 0); } + + if (mddev->private == NULL) + conf = setup_conf(mddev); + else + conf = mddev->private; + + if (IS_ERR(conf)) + return PTR_ERR(conf); + + mddev->thread = conf->thread; + conf->thread = NULL; + mddev->private = conf; + + /* + * 0 for a fully functional array, 1 or 2 for a degraded array. + */ + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= 0 && + test_bit(In_sync, &rdev->flags)) + working_disks++; + + mddev->degraded = conf->raid_disks - working_disks; + if (mddev->degraded > conf->max_degraded) { printk(KERN_ERR "raid5: not enough operational devices for %s" " (%d/%d failed)\n", @@ -4169,6 +4526,10 @@ static int run(mddev_t *mddev) goto abort; } + /* device size must be a multiple of chunk size */ + mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1); + mddev->resync_max_sectors = mddev->dev_sectors; + if (mddev->degraded > 0 && mddev->recovery_cp != MaxSector) { if (mddev->ok_start_degraded) @@ -4184,43 +4545,22 @@ static int run(mddev_t *mddev) } } - { - mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5"); - if (!mddev->thread) { - printk(KERN_ERR - "raid5: couldn't allocate thread for %s\n", - mdname(mddev)); - goto abort; - } - } - memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + - conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; - if (grow_stripes(conf, conf->max_nr_stripes)) { - printk(KERN_ERR - "raid5: couldn't allocate %dkB for buffers\n", memory); - shrink_stripes(conf); - md_unregister_thread(mddev->thread); - goto abort; - } else - printk(KERN_INFO "raid5: allocated %dkB for %s\n", - memory, mdname(mddev)); - if (mddev->degraded == 0) printk("raid5: raid level %d set %s active with %d out of %d" - " devices, algorithm %d\n", conf->level, mdname(mddev), - mddev->raid_disks-mddev->degraded, mddev->raid_disks, - conf->algorithm); + " devices, algorithm %d\n", conf->level, mdname(mddev), + mddev->raid_disks-mddev->degraded, mddev->raid_disks, + mddev->new_layout); else printk(KERN_ALERT "raid5: raid level %d set %s active with %d" " out of %d devices, algorithm %d\n", conf->level, mdname(mddev), mddev->raid_disks - mddev->degraded, - mddev->raid_disks, conf->algorithm); + mddev->raid_disks, mddev->new_layout); print_raid5_conf(conf); - if (conf->expand_progress != MaxSector) { + if (conf->reshape_progress != MaxSector) { printk("...ok start reshape thread\n"); - conf->expand_lo = conf->expand_progress; + conf->reshape_safe = conf->reshape_progress; atomic_set(&conf->reshape_stripes, 0); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); @@ -4247,18 +4587,22 @@ static int run(mddev_t *mddev) "raid5: failed to create sysfs attributes for %s\n", mdname(mddev)); + mddev->queue->queue_lock = &conf->device_lock; + mddev->queue->unplug_fn = raid5_unplug_device; mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = raid5_congested; - mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks - - conf->max_degraded); + md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); return 0; abort: + md_unregister_thread(mddev->thread); + mddev->thread = NULL; if (conf) { + shrink_stripes(conf); print_raid5_conf(conf); safe_put_page(conf->spare_page); kfree(conf->disks); @@ -4396,6 +4740,10 @@ static int raid5_remove_disk(mddev_t *mddev, int number) print_raid5_conf(conf); rdev = p->rdev; if (rdev) { + if (number >= conf->raid_disks && + conf->reshape_progress == MaxSector) + clear_bit(In_sync, &rdev->flags); + if (test_bit(In_sync, &rdev->flags) || atomic_read(&rdev->nr_pending)) { err = -EBUSY; @@ -4405,7 +4753,8 @@ static int raid5_remove_disk(mddev_t *mddev, int number) * isn't possible. */ if (!test_bit(Faulty, &rdev->flags) && - mddev->degraded <= conf->max_degraded) { + mddev->degraded <= conf->max_degraded && + number < conf->raid_disks) { err = -EBUSY; goto abort; } @@ -4472,36 +4821,48 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) * any io in the removed space completes, but it hardly seems * worth it. */ - raid5_conf_t *conf = mddev_to_conf(mddev); - sectors &= ~((sector_t)mddev->chunk_size/512 - 1); - mddev->array_sectors = sectors * (mddev->raid_disks - - conf->max_degraded); + md_set_array_sectors(mddev, raid5_size(mddev, sectors, + mddev->raid_disks)); + if (mddev->array_sectors > + raid5_size(mddev, sectors, mddev->raid_disks)) + return -EINVAL; set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; - if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { - mddev->recovery_cp = mddev->size << 1; + if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { + mddev->recovery_cp = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } - mddev->size = sectors /2; + mddev->dev_sectors = sectors; mddev->resync_max_sectors = sectors; return 0; } -#ifdef CONFIG_MD_RAID5_RESHAPE static int raid5_check_reshape(mddev_t *mddev) { raid5_conf_t *conf = mddev_to_conf(mddev); - int err; - if (mddev->delta_disks < 0 || - mddev->new_level != mddev->level) - return -EINVAL; /* Cannot shrink array or change level yet */ - if (mddev->delta_disks == 0) - return 0; /* nothing to do */ + if (mddev->delta_disks == 0 && + mddev->new_layout == mddev->layout && + mddev->new_chunk == mddev->chunk_size) + return -EINVAL; /* nothing to do */ if (mddev->bitmap) /* Cannot grow a bitmap yet */ return -EBUSY; + if (mddev->degraded > conf->max_degraded) + return -EINVAL; + if (mddev->delta_disks < 0) { + /* We might be able to shrink, but the devices must + * be made bigger first. + * For raid6, 4 is the minimum size. + * Otherwise 2 is the minimum + */ + int min = 2; + if (mddev->level == 6) + min = 4; + if (mddev->raid_disks + mddev->delta_disks < min) + return -EINVAL; + } /* Can only proceed if there are plenty of stripe_heads. * We need a minimum of one full stripe,, and for sensible progress @@ -4514,18 +4875,12 @@ static int raid5_check_reshape(mddev_t *mddev) if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes || (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", - (mddev->chunk_size / STRIPE_SIZE)*4); + (max(mddev->chunk_size, mddev->new_chunk) + / STRIPE_SIZE)*4); return -ENOSPC; } - err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks); - if (err) - return err; - - if (mddev->degraded > conf->max_degraded) - return -EINVAL; - /* looks like we might be able to manage this */ - return 0; + return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); } static int raid5_start_reshape(mddev_t *mddev) @@ -4550,12 +4905,31 @@ static int raid5_start_reshape(mddev_t *mddev) */ return -EINVAL; + /* Refuse to reduce size of the array. Any reductions in + * array size must be through explicit setting of array_size + * attribute. + */ + if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) + < mddev->array_sectors) { + printk(KERN_ERR "md: %s: array size must be reduced " + "before number of disks\n", mdname(mddev)); + return -EINVAL; + } + atomic_set(&conf->reshape_stripes, 0); spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; conf->raid_disks += mddev->delta_disks; - conf->expand_progress = 0; - conf->expand_lo = 0; + conf->prev_chunk = conf->chunk_size; + conf->chunk_size = mddev->new_chunk; + conf->prev_algo = conf->algorithm; + conf->algorithm = mddev->new_layout; + if (mddev->delta_disks < 0) + conf->reshape_progress = raid5_size(mddev, 0, 0); + else + conf->reshape_progress = 0; + conf->reshape_safe = conf->reshape_progress; + conf->generation++; spin_unlock_irq(&conf->device_lock); /* Add some new drives, as many as will fit. @@ -4580,9 +4954,12 @@ static int raid5_start_reshape(mddev_t *mddev) break; } - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices; - spin_unlock_irqrestore(&conf->device_lock, flags); + if (mddev->delta_disks > 0) { + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) + - added_devices; + spin_unlock_irqrestore(&conf->device_lock, flags); + } mddev->raid_disks = conf->raid_disks; mddev->reshape_position = 0; set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -4597,52 +4974,86 @@ static int raid5_start_reshape(mddev_t *mddev) mddev->recovery = 0; spin_lock_irq(&conf->device_lock); mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; - conf->expand_progress = MaxSector; + conf->reshape_progress = MaxSector; spin_unlock_irq(&conf->device_lock); return -EAGAIN; } + conf->reshape_checkpoint = jiffies; md_wakeup_thread(mddev->sync_thread); md_new_event(mddev); return 0; } -#endif +/* This is called from the reshape thread and should make any + * changes needed in 'conf' + */ static void end_reshape(raid5_conf_t *conf) { - struct block_device *bdev; if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { - conf->mddev->array_sectors = 2 * conf->mddev->size * - (conf->raid_disks - conf->max_degraded); - set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors); - conf->mddev->changed = 1; - - bdev = bdget_disk(conf->mddev->gendisk, 0); - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, - (loff_t)conf->mddev->array_sectors << 9); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); - } + spin_lock_irq(&conf->device_lock); - conf->expand_progress = MaxSector; + conf->previous_raid_disks = conf->raid_disks; + conf->reshape_progress = MaxSector; spin_unlock_irq(&conf->device_lock); - conf->mddev->reshape_position = MaxSector; + wake_up(&conf->wait_for_overlap); /* read-ahead size must cover two whole stripes, which is * 2 * (datadisks) * chunksize where 'n' is the number of raid devices */ { - int data_disks = conf->previous_raid_disks - conf->max_degraded; - int stripe = data_disks * - (conf->mddev->chunk_size / PAGE_SIZE); + int data_disks = conf->raid_disks - conf->max_degraded; + int stripe = data_disks * (conf->chunk_size + / PAGE_SIZE); if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; } } } +/* This is called from the raid5d thread with mddev_lock held. + * It makes config changes to the device. + */ +static void raid5_finish_reshape(mddev_t *mddev) +{ + struct block_device *bdev; + raid5_conf_t *conf = mddev_to_conf(mddev); + + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + + if (mddev->delta_disks > 0) { + md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); + set_capacity(mddev->gendisk, mddev->array_sectors); + mddev->changed = 1; + + bdev = bdget_disk(mddev->gendisk, 0); + if (bdev) { + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, + (loff_t)mddev->array_sectors << 9); + mutex_unlock(&bdev->bd_inode->i_mutex); + bdput(bdev); + } + } else { + int d; + mddev->degraded = conf->raid_disks; + for (d = 0; d < conf->raid_disks ; d++) + if (conf->disks[d].rdev && + test_bit(In_sync, + &conf->disks[d].rdev->flags)) + mddev->degraded--; + for (d = conf->raid_disks ; + d < conf->raid_disks - mddev->delta_disks; + d++) + raid5_remove_disk(mddev, d); + } + mddev->layout = conf->algorithm; + mddev->chunk_size = conf->chunk_size; + mddev->reshape_position = MaxSector; + mddev->delta_disks = 0; + } +} + static void raid5_quiesce(mddev_t *mddev, int state) { raid5_conf_t *conf = mddev_to_conf(mddev); @@ -4672,6 +5083,212 @@ static void raid5_quiesce(mddev_t *mddev, int state) } } + +static void *raid5_takeover_raid1(mddev_t *mddev) +{ + int chunksect; + + if (mddev->raid_disks != 2 || + mddev->degraded > 1) + return ERR_PTR(-EINVAL); + + /* Should check if there are write-behind devices? */ + + chunksect = 64*2; /* 64K by default */ + + /* The array must be an exact multiple of chunksize */ + while (chunksect && (mddev->array_sectors & (chunksect-1))) + chunksect >>= 1; + + if ((chunksect<<9) < STRIPE_SIZE) + /* array size does not allow a suitable chunk size */ + return ERR_PTR(-EINVAL); + + mddev->new_level = 5; + mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; + mddev->new_chunk = chunksect << 9; + + return setup_conf(mddev); +} + +static void *raid5_takeover_raid6(mddev_t *mddev) +{ + int new_layout; + + switch (mddev->layout) { + case ALGORITHM_LEFT_ASYMMETRIC_6: + new_layout = ALGORITHM_LEFT_ASYMMETRIC; + break; + case ALGORITHM_RIGHT_ASYMMETRIC_6: + new_layout = ALGORITHM_RIGHT_ASYMMETRIC; + break; + case ALGORITHM_LEFT_SYMMETRIC_6: + new_layout = ALGORITHM_LEFT_SYMMETRIC; + break; + case ALGORITHM_RIGHT_SYMMETRIC_6: + new_layout = ALGORITHM_RIGHT_SYMMETRIC; + break; + case ALGORITHM_PARITY_0_6: + new_layout = ALGORITHM_PARITY_0; + break; + case ALGORITHM_PARITY_N: + new_layout = ALGORITHM_PARITY_N; + break; + default: + return ERR_PTR(-EINVAL); + } + mddev->new_level = 5; + mddev->new_layout = new_layout; + mddev->delta_disks = -1; + mddev->raid_disks -= 1; + return setup_conf(mddev); +} + + +static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) +{ + /* For a 2-drive array, the layout and chunk size can be changed + * immediately as not restriping is needed. + * For larger arrays we record the new value - after validation + * to be used by a reshape pass. + */ + raid5_conf_t *conf = mddev_to_conf(mddev); + + if (new_layout >= 0 && !algorithm_valid_raid5(new_layout)) + return -EINVAL; + if (new_chunk > 0) { + if (new_chunk & (new_chunk-1)) + /* not a power of 2 */ + return -EINVAL; + if (new_chunk < PAGE_SIZE) + return -EINVAL; + if (mddev->array_sectors & ((new_chunk>>9)-1)) + /* not factor of array size */ + return -EINVAL; + } + + /* They look valid */ + + if (mddev->raid_disks == 2) { + + if (new_layout >= 0) { + conf->algorithm = new_layout; + mddev->layout = mddev->new_layout = new_layout; + } + if (new_chunk > 0) { + conf->chunk_size = new_chunk; + mddev->chunk_size = mddev->new_chunk = new_chunk; + } + set_bit(MD_CHANGE_DEVS, &mddev->flags); + md_wakeup_thread(mddev->thread); + } else { + if (new_layout >= 0) + mddev->new_layout = new_layout; + if (new_chunk > 0) + mddev->new_chunk = new_chunk; + } + return 0; +} + +static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk) +{ + if (new_layout >= 0 && !algorithm_valid_raid6(new_layout)) + return -EINVAL; + if (new_chunk > 0) { + if (new_chunk & (new_chunk-1)) + /* not a power of 2 */ + return -EINVAL; + if (new_chunk < PAGE_SIZE) + return -EINVAL; + if (mddev->array_sectors & ((new_chunk>>9)-1)) + /* not factor of array size */ + return -EINVAL; + } + + /* They look valid */ + + if (new_layout >= 0) + mddev->new_layout = new_layout; + if (new_chunk > 0) + mddev->new_chunk = new_chunk; + + return 0; +} + +static void *raid5_takeover(mddev_t *mddev) +{ + /* raid5 can take over: + * raid0 - if all devices are the same - make it a raid4 layout + * raid1 - if there are two drives. We need to know the chunk size + * raid4 - trivial - just use a raid4 layout. + * raid6 - Providing it is a *_6 layout + * + * For now, just do raid1 + */ + + if (mddev->level == 1) + return raid5_takeover_raid1(mddev); + if (mddev->level == 4) { + mddev->new_layout = ALGORITHM_PARITY_N; + mddev->new_level = 5; + return setup_conf(mddev); + } + if (mddev->level == 6) + return raid5_takeover_raid6(mddev); + + return ERR_PTR(-EINVAL); +} + + +static struct mdk_personality raid5_personality; + +static void *raid6_takeover(mddev_t *mddev) +{ + /* Currently can only take over a raid5. We map the + * personality to an equivalent raid6 personality + * with the Q block at the end. + */ + int new_layout; + + if (mddev->pers != &raid5_personality) + return ERR_PTR(-EINVAL); + if (mddev->degraded > 1) + return ERR_PTR(-EINVAL); + if (mddev->raid_disks > 253) + return ERR_PTR(-EINVAL); + if (mddev->raid_disks < 3) + return ERR_PTR(-EINVAL); + + switch (mddev->layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; + break; + case ALGORITHM_LEFT_SYMMETRIC: + new_layout = ALGORITHM_LEFT_SYMMETRIC_6; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; + break; + case ALGORITHM_PARITY_0: + new_layout = ALGORITHM_PARITY_0_6; + break; + case ALGORITHM_PARITY_N: + new_layout = ALGORITHM_PARITY_N; + break; + default: + return ERR_PTR(-EINVAL); + } + mddev->new_level = 6; + mddev->new_layout = new_layout; + mddev->delta_disks = 1; + mddev->raid_disks += 1; + return setup_conf(mddev); +} + + static struct mdk_personality raid6_personality = { .name = "raid6", @@ -4687,11 +5304,13 @@ static struct mdk_personality raid6_personality = .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, -#ifdef CONFIG_MD_RAID5_RESHAPE + .size = raid5_size, .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, -#endif + .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, + .takeover = raid6_takeover, + .reconfig = raid6_reconfig, }; static struct mdk_personality raid5_personality = { @@ -4708,11 +5327,13 @@ static struct mdk_personality raid5_personality = .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, -#ifdef CONFIG_MD_RAID5_RESHAPE + .size = raid5_size, .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, -#endif + .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, + .takeover = raid5_takeover, + .reconfig = raid5_reconfig, }; static struct mdk_personality raid4_personality = @@ -4730,20 +5351,15 @@ static struct mdk_personality raid4_personality = .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, -#ifdef CONFIG_MD_RAID5_RESHAPE + .size = raid5_size, .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, -#endif + .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, }; static int __init raid5_init(void) { - int e; - - e = raid6_select_algo(); - if ( e ) - return e; register_md_personality(&raid6_personality); register_md_personality(&raid5_personality); register_md_personality(&raid4_personality); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h new file mode 100644 index 00000000000..52ba99954de --- /dev/null +++ b/drivers/md/raid5.h @@ -0,0 +1,474 @@ +#ifndef _RAID5_H +#define _RAID5_H + +#include <linux/raid/xor.h> + +/* + * + * Each stripe contains one buffer per disc. Each buffer can be in + * one of a number of states stored in "flags". Changes between + * these states happen *almost* exclusively under a per-stripe + * spinlock. Some very specific changes can happen in bi_end_io, and + * these are not protected by the spin lock. + * + * The flag bits that are used to represent these states are: + * R5_UPTODATE and R5_LOCKED + * + * State Empty == !UPTODATE, !LOCK + * We have no data, and there is no active request + * State Want == !UPTODATE, LOCK + * A read request is being submitted for this block + * State Dirty == UPTODATE, LOCK + * Some new data is in this buffer, and it is being written out + * State Clean == UPTODATE, !LOCK + * We have valid data which is the same as on disc + * + * The possible state transitions are: + * + * Empty -> Want - on read or write to get old data for parity calc + * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE) + * Empty -> Clean - on compute_block when computing a block for failed drive + * Want -> Empty - on failed read + * Want -> Clean - on successful completion of read request + * Dirty -> Clean - on successful completion of write request + * Dirty -> Clean - on failed write + * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW) + * + * The Want->Empty, Want->Clean, Dirty->Clean, transitions + * all happen in b_end_io at interrupt time. + * Each sets the Uptodate bit before releasing the Lock bit. + * This leaves one multi-stage transition: + * Want->Dirty->Clean + * This is safe because thinking that a Clean buffer is actually dirty + * will at worst delay some action, and the stripe will be scheduled + * for attention after the transition is complete. + * + * There is one possibility that is not covered by these states. That + * is if one drive has failed and there is a spare being rebuilt. We + * can't distinguish between a clean block that has been generated + * from parity calculations, and a clean block that has been + * successfully written to the spare ( or to parity when resyncing). + * To distingush these states we have a stripe bit STRIPE_INSYNC that + * is set whenever a write is scheduled to the spare, or to the parity + * disc if there is no spare. A sync request clears this bit, and + * when we find it set with no buffers locked, we know the sync is + * complete. + * + * Buffers for the md device that arrive via make_request are attached + * to the appropriate stripe in one of two lists linked on b_reqnext. + * One list (bh_read) for read requests, one (bh_write) for write. + * There should never be more than one buffer on the two lists + * together, but we are not guaranteed of that so we allow for more. + * + * If a buffer is on the read list when the associated cache buffer is + * Uptodate, the data is copied into the read buffer and it's b_end_io + * routine is called. This may happen in the end_request routine only + * if the buffer has just successfully been read. end_request should + * remove the buffers from the list and then set the Uptodate bit on + * the buffer. Other threads may do this only if they first check + * that the Uptodate bit is set. Once they have checked that they may + * take buffers off the read queue. + * + * When a buffer on the write list is committed for write it is copied + * into the cache buffer, which is then marked dirty, and moved onto a + * third list, the written list (bh_written). Once both the parity + * block and the cached buffer are successfully written, any buffer on + * a written list can be returned with b_end_io. + * + * The write list and read list both act as fifos. The read list is + * protected by the device_lock. The write and written lists are + * protected by the stripe lock. The device_lock, which can be + * claimed while the stipe lock is held, is only for list + * manipulations and will only be held for a very short time. It can + * be claimed from interrupts. + * + * + * Stripes in the stripe cache can be on one of two lists (or on + * neither). The "inactive_list" contains stripes which are not + * currently being used for any request. They can freely be reused + * for another stripe. The "handle_list" contains stripes that need + * to be handled in some way. Both of these are fifo queues. Each + * stripe is also (potentially) linked to a hash bucket in the hash + * table so that it can be found by sector number. Stripes that are + * not hashed must be on the inactive_list, and will normally be at + * the front. All stripes start life this way. + * + * The inactive_list, handle_list and hash bucket lists are all protected by the + * device_lock. + * - stripes on the inactive_list never have their stripe_lock held. + * - stripes have a reference counter. If count==0, they are on a list. + * - If a stripe might need handling, STRIPE_HANDLE is set. + * - When refcount reaches zero, then if STRIPE_HANDLE it is put on + * handle_list else inactive_list + * + * This, combined with the fact that STRIPE_HANDLE is only ever + * cleared while a stripe has a non-zero count means that if the + * refcount is 0 and STRIPE_HANDLE is set, then it is on the + * handle_list and if recount is 0 and STRIPE_HANDLE is not set, then + * the stripe is on inactive_list. + * + * The possible transitions are: + * activate an unhashed/inactive stripe (get_active_stripe()) + * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev + * activate a hashed, possibly active stripe (get_active_stripe()) + * lockdev check-hash if(!cnt++)unlink-stripe unlockdev + * attach a request to an active stripe (add_stripe_bh()) + * lockdev attach-buffer unlockdev + * handle a stripe (handle_stripe()) + * lockstripe clrSTRIPE_HANDLE ... + * (lockdev check-buffers unlockdev) .. + * change-state .. + * record io/ops needed unlockstripe schedule io/ops + * release an active stripe (release_stripe()) + * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev + * + * The refcount counts each thread that have activated the stripe, + * plus raid5d if it is handling it, plus one for each active request + * on a cached buffer, and plus one if the stripe is undergoing stripe + * operations. + * + * Stripe operations are performed outside the stripe lock, + * the stripe operations are: + * -copying data between the stripe cache and user application buffers + * -computing blocks to save a disk access, or to recover a missing block + * -updating the parity on a write operation (reconstruct write and + * read-modify-write) + * -checking parity correctness + * -running i/o to disk + * These operations are carried out by raid5_run_ops which uses the async_tx + * api to (optionally) offload operations to dedicated hardware engines. + * When requesting an operation handle_stripe sets the pending bit for the + * operation and increments the count. raid5_run_ops is then run whenever + * the count is non-zero. + * There are some critical dependencies between the operations that prevent some + * from being requested while another is in flight. + * 1/ Parity check operations destroy the in cache version of the parity block, + * so we prevent parity dependent operations like writes and compute_blocks + * from starting while a check is in progress. Some dma engines can perform + * the check without damaging the parity block, in these cases the parity + * block is re-marked up to date (assuming the check was successful) and is + * not re-read from disk. + * 2/ When a write operation is requested we immediately lock the affected + * blocks, and mark them as not up to date. This causes new read requests + * to be held off, as well as parity checks and compute block operations. + * 3/ Once a compute block operation has been requested handle_stripe treats + * that block as if it is up to date. raid5_run_ops guaruntees that any + * operation that is dependent on the compute block result is initiated after + * the compute block completes. + */ + +/* + * Operations state - intermediate states that are visible outside of sh->lock + * In general _idle indicates nothing is running, _run indicates a data + * processing operation is active, and _result means the data processing result + * is stable and can be acted upon. For simple operations like biofill and + * compute that only have an _idle and _run state they are indicated with + * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN) + */ +/** + * enum check_states - handles syncing / repairing a stripe + * @check_state_idle - check operations are quiesced + * @check_state_run - check operation is running + * @check_state_result - set outside lock when check result is valid + * @check_state_compute_run - check failed and we are repairing + * @check_state_compute_result - set outside lock when compute result is valid + */ +enum check_states { + check_state_idle = 0, + check_state_run, /* parity check */ + check_state_check_result, + check_state_compute_run, /* parity repair */ + check_state_compute_result, +}; + +/** + * enum reconstruct_states - handles writing or expanding a stripe + */ +enum reconstruct_states { + reconstruct_state_idle = 0, + reconstruct_state_prexor_drain_run, /* prexor-write */ + reconstruct_state_drain_run, /* write */ + reconstruct_state_run, /* expand */ + reconstruct_state_prexor_drain_result, + reconstruct_state_drain_result, + reconstruct_state_result, +}; + +struct stripe_head { + struct hlist_node hash; + struct list_head lru; /* inactive_list or handle_list */ + struct raid5_private_data *raid_conf; + short generation; /* increments with every + * reshape */ + sector_t sector; /* sector of this row */ + short pd_idx; /* parity disk index */ + short qd_idx; /* 'Q' disk index for raid6 */ + short ddf_layout;/* use DDF ordering to calculate Q */ + unsigned long state; /* state flags */ + atomic_t count; /* nr of active thread/requests */ + spinlock_t lock; + int bm_seq; /* sequence number for bitmap flushes */ + int disks; /* disks in stripe */ + enum check_states check_state; + enum reconstruct_states reconstruct_state; + /* stripe_operations + * @target - STRIPE_OP_COMPUTE_BLK target + */ + struct stripe_operations { + int target; + u32 zero_sum_result; + } ops; + struct r5dev { + struct bio req; + struct bio_vec vec; + struct page *page; + struct bio *toread, *read, *towrite, *written; + sector_t sector; /* sector of this page */ + unsigned long flags; + } dev[1]; /* allocated with extra space depending of RAID geometry */ +}; + +/* stripe_head_state - collects and tracks the dynamic state of a stripe_head + * for handle_stripe. It is only valid under spin_lock(sh->lock); + */ +struct stripe_head_state { + int syncing, expanding, expanded; + int locked, uptodate, to_read, to_write, failed, written; + int to_fill, compute, req_compute, non_overwrite; + int failed_num; + unsigned long ops_request; +}; + +/* r6_state - extra state data only relevant to r6 */ +struct r6_state { + int p_failed, q_failed, failed_num[2]; +}; + +/* Flags */ +#define R5_UPTODATE 0 /* page contains current data */ +#define R5_LOCKED 1 /* IO has been submitted on "req" */ +#define R5_OVERWRITE 2 /* towrite covers whole page */ +/* and some that are internal to handle_stripe */ +#define R5_Insync 3 /* rdev && rdev->in_sync at start */ +#define R5_Wantread 4 /* want to schedule a read */ +#define R5_Wantwrite 5 +#define R5_Overlap 7 /* There is a pending overlapping request on this block */ +#define R5_ReadError 8 /* seen a read error here recently */ +#define R5_ReWrite 9 /* have tried to over-write the readerror */ + +#define R5_Expanded 10 /* This block now has post-expand data */ +#define R5_Wantcompute 11 /* compute_block in progress treat as + * uptodate + */ +#define R5_Wantfill 12 /* dev->toread contains a bio that needs + * filling + */ +#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ +/* + * Write method + */ +#define RECONSTRUCT_WRITE 1 +#define READ_MODIFY_WRITE 2 +/* not a write method, but a compute_parity mode */ +#define CHECK_PARITY 3 +/* Additional compute_parity mode -- updates the parity w/o LOCKING */ +#define UPDATE_PARITY 4 + +/* + * Stripe state + */ +#define STRIPE_HANDLE 2 +#define STRIPE_SYNCING 3 +#define STRIPE_INSYNC 4 +#define STRIPE_PREREAD_ACTIVE 5 +#define STRIPE_DELAYED 6 +#define STRIPE_DEGRADED 7 +#define STRIPE_BIT_DELAY 8 +#define STRIPE_EXPANDING 9 +#define STRIPE_EXPAND_SOURCE 10 +#define STRIPE_EXPAND_READY 11 +#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ +#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ +#define STRIPE_BIOFILL_RUN 14 +#define STRIPE_COMPUTE_RUN 15 +/* + * Operation request flags + */ +#define STRIPE_OP_BIOFILL 0 +#define STRIPE_OP_COMPUTE_BLK 1 +#define STRIPE_OP_PREXOR 2 +#define STRIPE_OP_BIODRAIN 3 +#define STRIPE_OP_POSTXOR 4 +#define STRIPE_OP_CHECK 5 + +/* + * Plugging: + * + * To improve write throughput, we need to delay the handling of some + * stripes until there has been a chance that several write requests + * for the one stripe have all been collected. + * In particular, any write request that would require pre-reading + * is put on a "delayed" queue until there are no stripes currently + * in a pre-read phase. Further, if the "delayed" queue is empty when + * a stripe is put on it then we "plug" the queue and do not process it + * until an unplug call is made. (the unplug_io_fn() is called). + * + * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add + * it to the count of prereading stripes. + * When write is initiated, or the stripe refcnt == 0 (just in case) we + * clear the PREREAD_ACTIVE flag and decrement the count + * Whenever the 'handle' queue is empty and the device is not plugged, we + * move any strips from delayed to handle and clear the DELAYED flag and set + * PREREAD_ACTIVE. + * In stripe_handle, if we find pre-reading is necessary, we do it if + * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. + * HANDLE gets cleared if stripe_handle leave nothing locked. + */ + + +struct disk_info { + mdk_rdev_t *rdev; +}; + +struct raid5_private_data { + struct hlist_head *stripe_hashtbl; + mddev_t *mddev; + struct disk_info *spare; + int chunk_size, level, algorithm; + int max_degraded; + int raid_disks; + int max_nr_stripes; + + /* reshape_progress is the leading edge of a 'reshape' + * It has value MaxSector when no reshape is happening + * If delta_disks < 0, it is the last sector we started work on, + * else is it the next sector to work on. + */ + sector_t reshape_progress; + /* reshape_safe is the trailing edge of a reshape. We know that + * before (or after) this address, all reshape has completed. + */ + sector_t reshape_safe; + int previous_raid_disks; + int prev_chunk, prev_algo; + short generation; /* increments with every reshape */ + unsigned long reshape_checkpoint; /* Time we last updated + * metadata */ + + struct list_head handle_list; /* stripes needing handling */ + struct list_head hold_list; /* preread ready stripes */ + struct list_head delayed_list; /* stripes that have plugged requests */ + struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ + struct bio *retry_read_aligned; /* currently retrying aligned bios */ + struct bio *retry_read_aligned_list; /* aligned bios retry list */ + atomic_t preread_active_stripes; /* stripes with scheduled io */ + atomic_t active_aligned_reads; + atomic_t pending_full_writes; /* full write backlog */ + int bypass_count; /* bypassed prereads */ + int bypass_threshold; /* preread nice */ + struct list_head *last_hold; /* detect hold_list promotions */ + + atomic_t reshape_stripes; /* stripes with pending writes for reshape */ + /* unfortunately we need two cache names as we temporarily have + * two caches. + */ + int active_name; + char cache_name[2][20]; + struct kmem_cache *slab_cache; /* for allocating stripes */ + + int seq_flush, seq_write; + int quiesce; + + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + + struct page *spare_page; /* Used when checking P/Q in raid6 */ + + /* + * Free stripes pool + */ + atomic_t active_stripes; + struct list_head inactive_list; + wait_queue_head_t wait_for_stripe; + wait_queue_head_t wait_for_overlap; + int inactive_blocked; /* release of inactive stripes blocked, + * waiting for 25% to be free + */ + int pool_size; /* number of disks in stripeheads in pool */ + spinlock_t device_lock; + struct disk_info *disks; + + /* When taking over an array from a different personality, we store + * the new thread here until we fully activate the array. + */ + struct mdk_thread_s *thread; +}; + +typedef struct raid5_private_data raid5_conf_t; + +#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) + +/* + * Our supported algorithms + */ +#define ALGORITHM_LEFT_ASYMMETRIC 0 /* Rotating Parity N with Data Restart */ +#define ALGORITHM_RIGHT_ASYMMETRIC 1 /* Rotating Parity 0 with Data Restart */ +#define ALGORITHM_LEFT_SYMMETRIC 2 /* Rotating Parity N with Data Continuation */ +#define ALGORITHM_RIGHT_SYMMETRIC 3 /* Rotating Parity 0 with Data Continuation */ + +/* Define non-rotating (raid4) algorithms. These allow + * conversion of raid4 to raid5. + */ +#define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */ +#define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. */ + +/* DDF RAID6 layouts differ from md/raid6 layouts in two ways. + * Firstly, the exact positioning of the parity block is slightly + * different between the 'LEFT_*' modes of md and the "_N_*" modes + * of DDF. + * Secondly, or order of datablocks over which the Q syndrome is computed + * is different. + * Consequently we have different layouts for DDF/raid6 than md/raid6. + * These layouts are from the DDFv1.2 spec. + * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but + * leaves RLQ=3 as 'Vendor Specific' + */ + +#define ALGORITHM_ROTATING_ZERO_RESTART 8 /* DDF PRL=6 RLQ=1 */ +#define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */ +#define ALGORITHM_ROTATING_N_CONTINUE 10 /*DDF PRL=6 RLQ=3 */ + + +/* For every RAID5 algorithm we define a RAID6 algorithm + * with exactly the same layout for data and parity, and + * with the Q block always on the last device (N-1). + * This allows trivial conversion from RAID5 to RAID6 + */ +#define ALGORITHM_LEFT_ASYMMETRIC_6 16 +#define ALGORITHM_RIGHT_ASYMMETRIC_6 17 +#define ALGORITHM_LEFT_SYMMETRIC_6 18 +#define ALGORITHM_RIGHT_SYMMETRIC_6 19 +#define ALGORITHM_PARITY_0_6 20 +#define ALGORITHM_PARITY_N_6 ALGORITHM_PARITY_N + +static inline int algorithm_valid_raid5(int layout) +{ + return (layout >= 0) && + (layout <= 5); +} +static inline int algorithm_valid_raid6(int layout) +{ + return (layout >= 0 && layout <= 5) + || + (layout == 8 || layout == 10) + || + (layout >= 16 && layout <= 20); +} + +static inline int algorithm_is_DDF(int layout) +{ + return layout >= 8 && layout <= 10; +} +#endif diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h deleted file mode 100644 index 98dcde88470..00000000000 --- a/drivers/md/raid6.h +++ /dev/null @@ -1,130 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2003 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ - -#ifndef LINUX_RAID_RAID6_H -#define LINUX_RAID_RAID6_H - -#ifdef __KERNEL__ - -/* Set to 1 to use kernel-wide empty_zero_page */ -#define RAID6_USE_EMPTY_ZERO_PAGE 0 - -#include <linux/raid/md.h> -#include <linux/raid/raid5.h> - -typedef raid5_conf_t raid6_conf_t; /* Same configuration */ - -/* Additional compute_parity mode -- updates the parity w/o LOCKING */ -#define UPDATE_PARITY 4 - -/* We need a pre-zeroed page... if we don't want to use the kernel-provided - one define it here */ -#if RAID6_USE_EMPTY_ZERO_PAGE -# define raid6_empty_zero_page empty_zero_page -#else -extern const char raid6_empty_zero_page[PAGE_SIZE]; -#endif - -#else /* ! __KERNEL__ */ -/* Used for testing in user space */ - -#include <errno.h> -#include <inttypes.h> -#include <limits.h> -#include <stddef.h> -#include <sys/mman.h> -#include <sys/types.h> - -/* Not standard, but glibc defines it */ -#define BITS_PER_LONG __WORDSIZE - -typedef uint8_t u8; -typedef uint16_t u16; -typedef uint32_t u32; -typedef uint64_t u64; - -#ifndef PAGE_SIZE -# define PAGE_SIZE 4096 -#endif -extern const char raid6_empty_zero_page[PAGE_SIZE]; - -#define __init -#define __exit -#define __attribute_const__ __attribute__((const)) -#define noinline __attribute__((noinline)) - -#define preempt_enable() -#define preempt_disable() -#define cpu_has_feature(x) 1 -#define enable_kernel_altivec() -#define disable_kernel_altivec() - -#endif /* __KERNEL__ */ - -/* Routine choices */ -struct raid6_calls { - void (*gen_syndrome)(int, size_t, void **); - int (*valid)(void); /* Returns 1 if this routine set is usable */ - const char *name; /* Name of this routine set */ - int prefer; /* Has special performance attribute */ -}; - -/* Selected algorithm */ -extern struct raid6_calls raid6_call; - -/* Algorithm list */ -extern const struct raid6_calls * const raid6_algos[]; -int raid6_select_algo(void); - -/* Return values from chk_syndrome */ -#define RAID6_OK 0 -#define RAID6_P_BAD 1 -#define RAID6_Q_BAD 2 -#define RAID6_PQ_BAD 3 - -/* Galois field tables */ -extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); -extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); -extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); -extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); - -/* Recovery routines */ -void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); -void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); -void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); - -/* Some definitions to allow code to be compiled for testing in userspace */ -#ifndef __KERNEL__ - -# define jiffies raid6_jiffies() -# define printk printf -# define GFP_KERNEL 0 -# define __get_free_pages(x,y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0)) -# define free_pages(x,y) munmap((void *)(x), (y)*PAGE_SIZE) - -static inline void cpu_relax(void) -{ - /* Nothing */ -} - -#undef HZ -#define HZ 1000 -static inline uint32_t raid6_jiffies(void) -{ - struct timeval tv; - gettimeofday(&tv, NULL); - return tv.tv_sec*1000 + tv.tv_usec/1000; -} - -#endif /* ! __KERNEL__ */ - -#endif /* LINUX_RAID_RAID6_H */ diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c index 21987e3dbe6..866215ac7f2 100644 --- a/drivers/md/raid6algos.c +++ b/drivers/md/raid6algos.c @@ -5,7 +5,7 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or + * Boston MA 02111-1307, USA; either version 2 of the License, or * (at your option) any later version; incorporated herein by reference. * * ----------------------------------------------------------------------- */ @@ -16,13 +16,20 @@ * Algorithm list and algorithm selection for RAID-6 */ -#include "raid6.h" +#include <linux/raid/pq.h> #ifndef __KERNEL__ #include <sys/mman.h> #include <stdio.h> +#else +#if !RAID6_USE_EMPTY_ZERO_PAGE +/* In .bss so it's zeroed */ +const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); +EXPORT_SYMBOL(raid6_empty_zero_page); +#endif #endif struct raid6_calls raid6_call; +EXPORT_SYMBOL_GPL(raid6_call); /* Various routine sets */ extern const struct raid6_calls raid6_intx1; @@ -79,6 +86,7 @@ const struct raid6_calls * const raid6_algos[] = { #else /* Need more time to be stable in userspace */ #define RAID6_TIME_JIFFIES_LG2 9 +#define time_before(x, y) ((x) < (y)) #endif /* Try to pick the best algorithm */ @@ -152,3 +160,12 @@ int __init raid6_select_algo(void) return best ? 0 : -EINVAL; } + +static void raid6_exit(void) +{ + do { } while (0); +} + +subsys_initcall(raid6_select_algo); +module_exit(raid6_exit); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc index b9afd35b881..699dfeee494 100644 --- a/drivers/md/raid6altivec.uc +++ b/drivers/md/raid6altivec.uc @@ -5,7 +5,7 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or + * Boston MA 02111-1307, USA; either version 2 of the License, or * (at your option) any later version; incorporated herein by reference. * * ----------------------------------------------------------------------- */ @@ -22,7 +22,7 @@ * bracked this with preempt_disable/enable or in a lock) */ -#include "raid6.h" +#include <linux/raid/pq.h> #ifdef CONFIG_ALTIVEC diff --git a/drivers/md/raid6int.uc b/drivers/md/raid6int.uc index ad004cee0e2..f9bf9cba357 100644 --- a/drivers/md/raid6int.uc +++ b/drivers/md/raid6int.uc @@ -5,7 +5,7 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or + * Boston MA 02111-1307, USA; either version 2 of the License, or * (at your option) any later version; incorporated herein by reference. * * ----------------------------------------------------------------------- */ @@ -18,7 +18,7 @@ * This file is postprocessed using unroll.pl */ -#include "raid6.h" +#include <linux/raid/pq.h> /* * This is the C data type to use diff --git a/drivers/md/raid6mmx.c b/drivers/md/raid6mmx.c index d4e4a1bd70a..e7f6c13132b 100644 --- a/drivers/md/raid6mmx.c +++ b/drivers/md/raid6mmx.c @@ -5,7 +5,7 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or + * Boston MA 02111-1307, USA; either version 2 of the License, or * (at your option) any later version; incorporated herein by reference. * * ----------------------------------------------------------------------- */ @@ -18,7 +18,7 @@ #if defined(__i386__) && !defined(__arch_um__) -#include "raid6.h" +#include <linux/raid/pq.h> #include "raid6x86.h" /* Shared with raid6sse1.c */ diff --git a/drivers/md/raid6recov.c b/drivers/md/raid6recov.c index a8c4d9451bd..2609f00e0d6 100644 --- a/drivers/md/raid6recov.c +++ b/drivers/md/raid6recov.c @@ -5,7 +5,7 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or + * Boston MA 02111-1307, USA; either version 2 of the License, or * (at your option) any later version; incorporated herein by reference. * * ----------------------------------------------------------------------- */ @@ -18,7 +18,7 @@ * the syndrome.) */ -#include "raid6.h" +#include <linux/raid/pq.h> /* Recover two failed data blocks. */ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, @@ -63,9 +63,7 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, p++; q++; } } - - - +EXPORT_SYMBOL_GPL(raid6_2data_recov); /* Recover failure of one data block plus the P block */ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) @@ -97,9 +95,10 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) q++; dq++; } } +EXPORT_SYMBOL_GPL(raid6_datap_recov); - -#ifndef __KERNEL__ /* Testing only */ +#ifndef __KERNEL__ +/* Testing only */ /* Recover two failed blocks. */ void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs) diff --git a/drivers/md/raid6sse1.c b/drivers/md/raid6sse1.c index 0666237276f..b274dd5eab8 100644 --- a/drivers/md/raid6sse1.c +++ b/drivers/md/raid6sse1.c @@ -5,7 +5,7 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or + * Boston MA 02111-1307, USA; either version 2 of the License, or * (at your option) any later version; incorporated herein by reference. * * ----------------------------------------------------------------------- */ @@ -23,7 +23,7 @@ #if defined(__i386__) && !defined(__arch_um__) -#include "raid6.h" +#include <linux/raid/pq.h> #include "raid6x86.h" /* Defined in raid6mmx.c */ diff --git a/drivers/md/raid6sse2.c b/drivers/md/raid6sse2.c index b034ad86803..6ed6c6c0389 100644 --- a/drivers/md/raid6sse2.c +++ b/drivers/md/raid6sse2.c @@ -5,7 +5,7 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or + * Boston MA 02111-1307, USA; either version 2 of the License, or * (at your option) any later version; incorporated herein by reference. * * ----------------------------------------------------------------------- */ @@ -19,7 +19,7 @@ #if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) -#include "raid6.h" +#include <linux/raid/pq.h> #include "raid6x86.h" static const struct raid6_sse_constants { diff --git a/drivers/md/raid6test/Makefile b/drivers/md/raid6test/Makefile index 78e0396adf2..58ffdf4f516 100644 --- a/drivers/md/raid6test/Makefile +++ b/drivers/md/raid6test/Makefile @@ -5,7 +5,7 @@ CC = gcc OPTFLAGS = -O2 # Adjust as desired -CFLAGS = -I.. -g $(OPTFLAGS) +CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS) LD = ld PERL = perl AR = ar diff --git a/drivers/md/raid6test/test.c b/drivers/md/raid6test/test.c index 559cc41b258..7a930318b17 100644 --- a/drivers/md/raid6test/test.c +++ b/drivers/md/raid6test/test.c @@ -17,7 +17,7 @@ #include <stdlib.h> #include <stdio.h> #include <string.h> -#include "raid6.h" +#include <linux/raid/pq.h> #define NDISKS 16 /* Including P and Q */ diff --git a/drivers/md/raid6x86.h b/drivers/md/raid6x86.h index 99fea7a70ca..4c22c156855 100644 --- a/drivers/md/raid6x86.h +++ b/drivers/md/raid6x86.h @@ -5,7 +5,7 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or + * Boston MA 02111-1307, USA; either version 2 of the License, or * (at your option) any later version; incorporated herein by reference. * * ----------------------------------------------------------------------- */ diff --git a/drivers/mfd/twl4030-core.c b/drivers/mfd/twl4030-core.c index 68826f1e36b..ec90e953adc 100644 --- a/drivers/mfd/twl4030-core.c +++ b/drivers/mfd/twl4030-core.c @@ -592,11 +592,9 @@ add_children(struct twl4030_platform_data *pdata, unsigned long features) /* maybe add LDOs that are omitted on cost-reduced parts */ if (twl_has_regulator() && !(features & TPS_SUBSET)) { - /* child = add_regulator(TWL4030_REG_VPLL2, pdata->vpll2); if (IS_ERR(child)) return PTR_ERR(child); - */ child = add_regulator(TWL4030_REG_VMMC2, pdata->vmmc2); if (IS_ERR(child)) diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index df6ce4a06cf..1445ea8f10a 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -21,6 +21,7 @@ #include <linux/leds.h> #include <linux/scatterlist.h> #include <linux/log2.h> +#include <linux/regulator/consumer.h> #include <linux/mmc/card.h> #include <linux/mmc/host.h> @@ -523,6 +524,105 @@ u32 mmc_vddrange_to_ocrmask(int vdd_min, int vdd_max) } EXPORT_SYMBOL(mmc_vddrange_to_ocrmask); +#ifdef CONFIG_REGULATOR + +/** + * mmc_regulator_get_ocrmask - return mask of supported voltages + * @supply: regulator to use + * + * This returns either a negative errno, or a mask of voltages that + * can be provided to MMC/SD/SDIO devices using the specified voltage + * regulator. This would normally be called before registering the + * MMC host adapter. + */ +int mmc_regulator_get_ocrmask(struct regulator *supply) +{ + int result = 0; + int count; + int i; + + count = regulator_count_voltages(supply); + if (count < 0) + return count; + + for (i = 0; i < count; i++) { + int vdd_uV; + int vdd_mV; + + vdd_uV = regulator_list_voltage(supply, i); + if (vdd_uV <= 0) + continue; + + vdd_mV = vdd_uV / 1000; + result |= mmc_vddrange_to_ocrmask(vdd_mV, vdd_mV); + } + + return result; +} +EXPORT_SYMBOL(mmc_regulator_get_ocrmask); + +/** + * mmc_regulator_set_ocr - set regulator to match host->ios voltage + * @vdd_bit: zero for power off, else a bit number (host->ios.vdd) + * @supply: regulator to use + * + * Returns zero on success, else negative errno. + * + * MMC host drivers may use this to enable or disable a regulator using + * a particular supply voltage. This would normally be called from the + * set_ios() method. + */ +int mmc_regulator_set_ocr(struct regulator *supply, unsigned short vdd_bit) +{ + int result = 0; + int min_uV, max_uV; + int enabled; + + enabled = regulator_is_enabled(supply); + if (enabled < 0) + return enabled; + + if (vdd_bit) { + int tmp; + int voltage; + + /* REVISIT mmc_vddrange_to_ocrmask() may have set some + * bits this regulator doesn't quite support ... don't + * be too picky, most cards and regulators are OK with + * a 0.1V range goof (it's a small error percentage). + */ + tmp = vdd_bit - ilog2(MMC_VDD_165_195); + if (tmp == 0) { + min_uV = 1650 * 1000; + max_uV = 1950 * 1000; + } else { + min_uV = 1900 * 1000 + tmp * 100 * 1000; + max_uV = min_uV + 100 * 1000; + } + + /* avoid needless changes to this voltage; the regulator + * might not allow this operation + */ + voltage = regulator_get_voltage(supply); + if (voltage < 0) + result = voltage; + else if (voltage < min_uV || voltage > max_uV) + result = regulator_set_voltage(supply, min_uV, max_uV); + else + result = 0; + + if (result == 0 && !enabled) + result = regulator_enable(supply); + } else if (enabled) { + result = regulator_disable(supply); + } + + return result; +} +EXPORT_SYMBOL(mmc_regulator_set_ocr); + +#endif + /* * Mask off any voltages we don't support and select * the lowest voltage diff --git a/drivers/mtd/maps/pxa2xx-flash.c b/drivers/mtd/maps/pxa2xx-flash.c index e9026cb1c5b..572d32fdf38 100644 --- a/drivers/mtd/maps/pxa2xx-flash.c +++ b/drivers/mtd/maps/pxa2xx-flash.c @@ -117,7 +117,7 @@ static int __init pxa2xx_flash_probe(struct platform_device *pdev) return 0; } -static int __exit pxa2xx_flash_remove(struct platform_device *dev) +static int __devexit pxa2xx_flash_remove(struct platform_device *dev) { struct pxa2xx_flash_info *info = platform_get_drvdata(dev); diff --git a/drivers/parisc/asp.c b/drivers/parisc/asp.c index 7931133526c..9ca21098b14 100644 --- a/drivers/parisc/asp.c +++ b/drivers/parisc/asp.c @@ -81,7 +81,7 @@ static int __init asp_init_chip(struct parisc_device *dev) asp.hpa = ASP_INTERRUPT_ADDR; printk(KERN_INFO "%s version %d at 0x%lx found.\n", - asp.name, asp.version, dev->hpa.start); + asp.name, asp.version, (unsigned long)dev->hpa.start); /* the IRQ ASP should use */ ret = -EBUSY; diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index cd4dd7ed2c0..5d610cbcfe8 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -406,8 +406,6 @@ resource_found: } ioc->avg_search[ioc->avg_idx++] = cr_start; ioc->avg_idx &= CCIO_SEARCH_SAMPLE - 1; -#endif -#ifdef CCIO_COLLECT_STATS ioc->used_pages += pages_needed; #endif /* @@ -453,10 +451,10 @@ ccio_free_range(struct ioc *ioc, dma_addr_t iova, unsigned long pages_mapped) unsigned long mask = ~(~0UL >> pages_mapped); CCIO_FREE_MAPPINGS(ioc, res_idx, mask, 8); #else - CCIO_FREE_MAPPINGS(ioc, res_idx, 0xff, 8); + CCIO_FREE_MAPPINGS(ioc, res_idx, 0xffUL, 8); #endif } else if(pages_mapped <= 16) { - CCIO_FREE_MAPPINGS(ioc, res_idx, 0xffff, 16); + CCIO_FREE_MAPPINGS(ioc, res_idx, 0xffffUL, 16); } else if(pages_mapped <= 32) { CCIO_FREE_MAPPINGS(ioc, res_idx, ~(unsigned int)0, 32); #ifdef __LP64__ @@ -1028,8 +1026,10 @@ static int ccio_proc_info(struct seq_file *m, void *p) while (ioc != NULL) { unsigned int total_pages = ioc->res_size << 3; +#ifdef CCIO_COLLECT_STATS unsigned long avg = 0, min, max; int j; +#endif len += seq_printf(m, "%s\n", ioc->name); @@ -1060,8 +1060,7 @@ static int ccio_proc_info(struct seq_file *m, void *p) avg /= CCIO_SEARCH_SAMPLE; len += seq_printf(m, " Bitmap search : %ld/%ld/%ld (min/avg/max CPU Cycles)\n", min, avg, max); -#endif -#ifdef CCIO_COLLECT_STATS + len += seq_printf(m, "pci_map_single(): %8ld calls %8ld pages (avg %d/1000)\n", ioc->msingle_calls, ioc->msingle_pages, (int)((ioc->msingle_pages * 1000)/ioc->msingle_calls)); @@ -1400,7 +1399,7 @@ ccio_init_resource(struct resource *res, char *name, void __iomem *ioaddr) result = insert_resource(&iomem_resource, res); if (result < 0) { printk(KERN_ERR "%s() failed to claim CCIO bus address space (%08lx,%08lx)\n", - __func__, res->start, res->end); + __func__, (unsigned long)res->start, (unsigned long)res->end); } } @@ -1551,7 +1550,8 @@ static int __init ccio_probe(struct parisc_device *dev) ioc->name = dev->id.hversion == U2_IOA_RUNWAY ? "U2" : "UTurn"; - printk(KERN_INFO "Found %s at 0x%lx\n", ioc->name, dev->hpa.start); + printk(KERN_INFO "Found %s at 0x%lx\n", ioc->name, + (unsigned long)dev->hpa.start); for (i = 0; i < ioc_count; i++) { ioc_p = &(*ioc_p)->next; diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c index bb5a1c9597c..52ae0b1d470 100644 --- a/drivers/parisc/dino.c +++ b/drivers/parisc/dino.c @@ -819,7 +819,9 @@ dino_bridge_init(struct dino_device *dino_dev, const char *name) result = ccio_request_resource(dino_dev->hba.dev, &res[i]); if (result < 0) { - printk(KERN_ERR "%s: failed to claim PCI Bus address space %d (0x%lx-0x%lx)!\n", name, i, res[i].start, res[i].end); + printk(KERN_ERR "%s: failed to claim PCI Bus address " + "space %d (0x%lx-0x%lx)!\n", name, i, + (unsigned long)res[i].start, (unsigned long)res[i].end); return result; } } @@ -899,7 +901,8 @@ static int __init dino_common_init(struct parisc_device *dev, if (request_resource(&ioport_resource, res) < 0) { printk(KERN_ERR "%s: request I/O Port region failed " "0x%lx/%lx (hpa 0x%p)\n", - name, res->start, res->end, dino_dev->hba.base_addr); + name, (unsigned long)res->start, (unsigned long)res->end, + dino_dev->hba.base_addr); return 1; } diff --git a/drivers/parisc/eisa.c b/drivers/parisc/eisa.c index 7891db50c48..f415fdd9a88 100644 --- a/drivers/parisc/eisa.c +++ b/drivers/parisc/eisa.c @@ -314,7 +314,7 @@ static int __init eisa_probe(struct parisc_device *dev) char *name = is_mongoose(dev) ? "Mongoose" : "Wax"; printk(KERN_INFO "%s EISA Adapter found at 0x%08lx\n", - name, dev->hpa.start); + name, (unsigned long)dev->hpa.start); eisa_dev.hba.dev = dev; eisa_dev.hba.iommu = ccio_get_iommu(dev); diff --git a/drivers/parisc/eisa_enumerator.c b/drivers/parisc/eisa_enumerator.c index 6d8aae003f6..c709ecc2b7f 100644 --- a/drivers/parisc/eisa_enumerator.c +++ b/drivers/parisc/eisa_enumerator.c @@ -98,7 +98,7 @@ static int configure_memory(const unsigned char *buf, res->start = mem_parent->start + get_24(buf+len+2); res->end = res->start + get_16(buf+len+5)*1024; res->flags = IORESOURCE_MEM; - printk("memory %lx-%lx ", res->start, res->end); + printk("memory %lx-%lx ", (unsigned long)res->start, (unsigned long)res->end); result = request_resource(mem_parent, res); if (result < 0) { printk("\n" KERN_ERR "EISA Enumerator: failed to claim EISA Bus address space!\n"); @@ -188,7 +188,7 @@ static int configure_port(const unsigned char *buf, struct resource *io_parent, res->start = get_16(buf+len+1); res->end = get_16(buf+len+1)+(c&HPEE_PORT_SIZE_MASK)+1; res->flags = IORESOURCE_IO; - printk("ioports %lx-%lx ", res->start, res->end); + printk("ioports %lx-%lx ", (unsigned long)res->start, (unsigned long)res->end); result = request_resource(io_parent, res); if (result < 0) { printk("\n" KERN_ERR "EISA Enumerator: failed to claim EISA Bus address space!\n"); diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c index 501aaf1f253..73348c4047e 100644 --- a/drivers/parisc/iosapic.c +++ b/drivers/parisc/iosapic.c @@ -714,7 +714,7 @@ static void iosapic_set_affinity_irq(unsigned int irq, if (dest_cpu < 0) return; - irq_desc[irq].affinity = cpumask_of_cpu(dest_cpu); + cpumask_copy(irq_desc[irq].affinity, cpumask_of(dest_cpu)); vi->txn_addr = txn_affinity_addr(irq, dest_cpu); spin_lock_irqsave(&iosapic_lock, flags); diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c index 454b6532e40..9581d361945 100644 --- a/drivers/parisc/led.c +++ b/drivers/parisc/led.c @@ -3,7 +3,7 @@ * * (c) Copyright 2000 Red Hat Software * (c) Copyright 2000 Helge Deller <hdeller@redhat.com> - * (c) Copyright 2001-2005 Helge Deller <deller@gmx.de> + * (c) Copyright 2001-2009 Helge Deller <deller@gmx.de> * (c) Copyright 2001 Randolph Chung <tausq@debian.org> * * This program is free software; you can redistribute it and/or modify @@ -243,13 +243,11 @@ static int __init led_create_procfs(void) proc_pdc_root = proc_mkdir("pdc", 0); if (!proc_pdc_root) return -1; - proc_pdc_root->owner = THIS_MODULE; ent = create_proc_entry("led", S_IFREG|S_IRUGO|S_IWUSR, proc_pdc_root); if (!ent) return -1; ent->data = (void *)LED_NOLCD; /* LED */ ent->read_proc = led_proc_read; ent->write_proc = led_proc_write; - ent->owner = THIS_MODULE; if (led_type == LED_HASLCD) { @@ -258,7 +256,6 @@ static int __init led_create_procfs(void) ent->data = (void *)LED_HASLCD; /* LCD */ ent->read_proc = led_proc_read; ent->write_proc = led_proc_write; - ent->owner = THIS_MODULE; } return 0; @@ -463,9 +460,20 @@ static void led_work_func (struct work_struct *unused) if (likely(led_lanrxtx)) currentleds |= led_get_net_activity(); if (likely(led_diskio)) currentleds |= led_get_diskio_activity(); - /* blink all LEDs twice a second if we got an Oops (HPMC) */ - if (unlikely(oops_in_progress)) - currentleds = (count_HZ<=(HZ/2)) ? 0 : 0xff; + /* blink LEDs if we got an Oops (HPMC) */ + if (unlikely(oops_in_progress)) { + if (boot_cpu_data.cpu_type >= pcxl2) { + /* newer machines don't have loadavg. LEDs, so we + * let all LEDs blink twice per second instead */ + currentleds = (count_HZ <= (HZ/2)) ? 0 : 0xff; + } else { + /* old machines: blink loadavg. LEDs twice per second */ + if (count_HZ <= (HZ/2)) + currentleds &= ~(LED4|LED5|LED6|LED7); + else + currentleds |= (LED4|LED5|LED6|LED7); + } + } if (currentleds != lastleds) { @@ -511,7 +519,7 @@ static int led_halt(struct notifier_block *nb, unsigned long event, void *buf) /* Cancel the work item and delete the queue */ if (led_wq) { - cancel_rearming_delayed_workqueue(led_wq, &led_task); + cancel_delayed_work_sync(&led_task); destroy_workqueue(led_wq); led_wq = NULL; } @@ -630,7 +638,7 @@ int lcd_print( const char *str ) /* temporarily disable the led work task */ if (led_wq) - cancel_rearming_delayed_workqueue(led_wq, &led_task); + cancel_delayed_work_sync(&led_task); /* copy display string to buffer for procfs */ strlcpy(lcd_text, str, sizeof(lcd_text)); diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 9dbd5066aca..23e56a564e0 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -164,7 +164,8 @@ static inline void context_clear_entry(struct context_entry *context) * 1: writable * 2-6: reserved * 7: super page - * 8-11: available + * 8-10: available + * 11: snoop behavior * 12-63: Host physcial address */ struct dma_pte { @@ -186,6 +187,11 @@ static inline void dma_set_pte_writable(struct dma_pte *pte) pte->val |= DMA_PTE_WRITE; } +static inline void dma_set_pte_snp(struct dma_pte *pte) +{ + pte->val |= DMA_PTE_SNP; +} + static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot) { pte->val = (pte->val & ~3) | (prot & 3); @@ -231,6 +237,7 @@ struct dmar_domain { int flags; /* flags to find out type of domain */ int iommu_coherency;/* indicate coherency of iommu access */ + int iommu_snooping; /* indicate snooping control feature*/ int iommu_count; /* reference count of iommu */ spinlock_t iommu_lock; /* protect iommu set in domain */ u64 max_addr; /* maximum mapped address */ @@ -421,7 +428,6 @@ static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain) return g_iommus[iommu_id]; } -/* "Coherency" capability may be different across iommus */ static void domain_update_iommu_coherency(struct dmar_domain *domain) { int i; @@ -438,6 +444,29 @@ static void domain_update_iommu_coherency(struct dmar_domain *domain) } } +static void domain_update_iommu_snooping(struct dmar_domain *domain) +{ + int i; + + domain->iommu_snooping = 1; + + i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus); + for (; i < g_num_of_iommus; ) { + if (!ecap_sc_support(g_iommus[i]->ecap)) { + domain->iommu_snooping = 0; + break; + } + i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1); + } +} + +/* Some capabilities may be different across iommus */ +static void domain_update_iommu_cap(struct dmar_domain *domain) +{ + domain_update_iommu_coherency(domain); + domain_update_iommu_snooping(domain); +} + static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn) { struct dmar_drhd_unit *drhd = NULL; @@ -689,15 +718,17 @@ static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr) static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end) { int addr_width = agaw_to_width(domain->agaw); + int npages; start &= (((u64)1) << addr_width) - 1; end &= (((u64)1) << addr_width) - 1; /* in case it's partial page */ start = PAGE_ALIGN(start); end &= PAGE_MASK; + npages = (end - start) / VTD_PAGE_SIZE; /* we don't need lock here, nobody else touches the iova range */ - while (start < end) { + while (npages--) { dma_pte_clear_one(domain, start); start += VTD_PAGE_SIZE; } @@ -1241,6 +1272,11 @@ static int domain_init(struct dmar_domain *domain, int guest_width) else domain->iommu_coherency = 0; + if (ecap_sc_support(iommu->ecap)) + domain->iommu_snooping = 1; + else + domain->iommu_snooping = 0; + domain->iommu_count = 1; /* always allocate the top pgd */ @@ -1369,7 +1405,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, spin_lock_irqsave(&domain->iommu_lock, flags); if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) { domain->iommu_count++; - domain_update_iommu_coherency(domain); + domain_update_iommu_cap(domain); } spin_unlock_irqrestore(&domain->iommu_lock, flags); return 0; @@ -1469,6 +1505,8 @@ domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova, BUG_ON(dma_pte_addr(pte)); dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT); dma_set_pte_prot(pte, prot); + if (prot & DMA_PTE_SNP) + dma_set_pte_snp(pte); domain_flush_cache(domain, pte, sizeof(*pte)); start_pfn++; index++; @@ -2119,7 +2157,7 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, error: if (iova) __free_iova(&domain->iovad, iova); - printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n", + printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n", pci_name(pdev), size, (unsigned long long)paddr, dir); return 0; } @@ -2218,7 +2256,7 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, start_addr = iova->pfn_lo << PAGE_SHIFT; size = aligned_size((u64)dev_addr, size); - pr_debug("Device %s unmapping: %lx@%llx\n", + pr_debug("Device %s unmapping: %zx@%llx\n", pci_name(pdev), size, (unsigned long long)start_addr); /* clear the whole page */ @@ -2282,8 +2320,6 @@ static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, order); } -#define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg))) - static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, int nelems, enum dma_data_direction dir, struct dma_attrs *attrs) @@ -2294,7 +2330,7 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, unsigned long start_addr; struct iova *iova; size_t size = 0; - void *addr; + phys_addr_t addr; struct scatterlist *sg; struct intel_iommu *iommu; @@ -2310,7 +2346,7 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, if (!iova) return; for_each_sg(sglist, sg, nelems, i) { - addr = SG_ENT_VIRT_ADDRESS(sg); + addr = page_to_phys(sg_page(sg)) + sg->offset; size += aligned_size((u64)addr, sg->length); } @@ -2337,7 +2373,7 @@ static int intel_nontranslate_map_sg(struct device *hddev, for_each_sg(sglist, sg, nelems, i) { BUG_ON(!sg_page(sg)); - sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg)); + sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset; sg->dma_length = sg->length; } return nelems; @@ -2346,7 +2382,7 @@ static int intel_nontranslate_map_sg(struct device *hddev, static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems, enum dma_data_direction dir, struct dma_attrs *attrs) { - void *addr; + phys_addr_t addr; int i; struct pci_dev *pdev = to_pci_dev(hwdev); struct dmar_domain *domain; @@ -2370,8 +2406,7 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne iommu = domain_get_iommu(domain); for_each_sg(sglist, sg, nelems, i) { - addr = SG_ENT_VIRT_ADDRESS(sg); - addr = (void *)virt_to_phys(addr); + addr = page_to_phys(sg_page(sg)) + sg->offset; size += aligned_size((u64)addr, sg->length); } @@ -2394,8 +2429,7 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne start_addr = iova->pfn_lo << PAGE_SHIFT; offset = 0; for_each_sg(sglist, sg, nelems, i) { - addr = SG_ENT_VIRT_ADDRESS(sg); - addr = (void *)virt_to_phys(addr); + addr = page_to_phys(sg_page(sg)) + sg->offset; size = aligned_size((u64)addr, sg->length); ret = domain_page_mapping(domain, start_addr + offset, ((u64)addr) & PAGE_MASK, @@ -2628,6 +2662,33 @@ static int vm_domain_add_dev_info(struct dmar_domain *domain, return 0; } +static void iommu_detach_dependent_devices(struct intel_iommu *iommu, + struct pci_dev *pdev) +{ + struct pci_dev *tmp, *parent; + + if (!iommu || !pdev) + return; + + /* dependent device detach */ + tmp = pci_find_upstream_pcie_bridge(pdev); + /* Secondary interface's bus number and devfn 0 */ + if (tmp) { + parent = pdev->bus->self; + while (parent != tmp) { + iommu_detach_dev(iommu, parent->bus->number, + parent->devfn); + parent = parent->bus->self; + } + if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */ + iommu_detach_dev(iommu, + tmp->subordinate->number, 0); + else /* this is a legacy PCI bridge */ + iommu_detach_dev(iommu, + tmp->bus->number, tmp->devfn); + } +} + static void vm_domain_remove_one_dev_info(struct dmar_domain *domain, struct pci_dev *pdev) { @@ -2653,6 +2714,7 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain, spin_unlock_irqrestore(&device_domain_lock, flags); iommu_detach_dev(iommu, info->bus, info->devfn); + iommu_detach_dependent_devices(iommu, pdev); free_devinfo_mem(info); spin_lock_irqsave(&device_domain_lock, flags); @@ -2676,7 +2738,7 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain, spin_lock_irqsave(&domain->iommu_lock, tmp_flags); clear_bit(iommu->seq_id, &domain->iommu_bmp); domain->iommu_count--; - domain_update_iommu_coherency(domain); + domain_update_iommu_cap(domain); spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags); } @@ -2702,15 +2764,16 @@ static void vm_domain_remove_all_dev_info(struct dmar_domain *domain) iommu = device_to_iommu(info->bus, info->devfn); iommu_detach_dev(iommu, info->bus, info->devfn); + iommu_detach_dependent_devices(iommu, info->dev); /* clear this iommu in iommu_bmp, update iommu count - * and coherency + * and capabilities */ spin_lock_irqsave(&domain->iommu_lock, flags2); if (test_and_clear_bit(iommu->seq_id, &domain->iommu_bmp)) { domain->iommu_count--; - domain_update_iommu_coherency(domain); + domain_update_iommu_cap(domain); } spin_unlock_irqrestore(&domain->iommu_lock, flags2); @@ -2933,6 +2996,8 @@ static int intel_iommu_map_range(struct iommu_domain *domain, prot |= DMA_PTE_READ; if (iommu_prot & IOMMU_WRITE) prot |= DMA_PTE_WRITE; + if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) + prot |= DMA_PTE_SNP; max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size); if (dmar_domain->max_addr < max_addr) { @@ -2986,6 +3051,17 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, return phys; } +static int intel_iommu_domain_has_cap(struct iommu_domain *domain, + unsigned long cap) +{ + struct dmar_domain *dmar_domain = domain->priv; + + if (cap == IOMMU_CAP_CACHE_COHERENCY) + return dmar_domain->iommu_snooping; + + return 0; +} + static struct iommu_ops intel_iommu_ops = { .domain_init = intel_iommu_domain_init, .domain_destroy = intel_iommu_domain_destroy, @@ -2994,6 +3070,7 @@ static struct iommu_ops intel_iommu_ops = { .map = intel_iommu_map_range, .unmap = intel_iommu_unmap_range, .iova_to_phys = intel_iommu_iova_to_phys, + .domain_has_cap = intel_iommu_domain_has_cap, }; static void __devinit quirk_iommu_rwbf(struct pci_dev *dev) diff --git a/drivers/pcmcia/pxa2xx_cm_x255.c b/drivers/pcmcia/pxa2xx_cm_x255.c index 4ed64d8e95e..5143a760153 100644 --- a/drivers/pcmcia/pxa2xx_cm_x255.c +++ b/drivers/pcmcia/pxa2xx_cm_x255.c @@ -63,7 +63,7 @@ static void cmx255_pcmcia_socket_state(struct soc_pcmcia_socket *skt, struct pcmcia_state *state) { int cd = skt->nr ? GPIO_PCMCIA_S1_CD_VALID : GPIO_PCMCIA_S0_CD_VALID; - int rdy = skt->nr ? GPIO_PCMCIA_S0_RDYINT : GPIO_PCMCIA_S1_RDYINT; + int rdy = skt->nr ? GPIO_PCMCIA_S1_RDYINT : GPIO_PCMCIA_S0_RDYINT; state->detect = !gpio_get_value(cd); state->ready = !!gpio_get_value(rdy); diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index e7e0cf102d6..e58c0ce65aa 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -29,8 +29,12 @@ config REGULATOR_DEBUG Say yes here to enable debugging support. config REGULATOR_FIXED_VOLTAGE - tristate + tristate "Fixed voltage regulator support" default n + help + This driver provides support for fixed voltage regulators, + useful for systems which use a combination of software + managed regulators and simple non-configurable regulators. config REGULATOR_VIRTUAL_CONSUMER tristate "Virtual regulator consumer support" @@ -52,6 +56,13 @@ config REGULATOR_BQ24022 charging select between 100 mA and 500 mA charging current limit. +config REGULATOR_TWL4030 + bool "TI TWL4030/TWL5030/TPS695x0 PMIC" + depends on TWL4030_CORE + help + This driver supports the voltage regulators provided by + this family of companion chips. + config REGULATOR_WM8350 tristate "Wolfson Microelectroncis WM8350 AudioPlus PMIC" depends on MFD_WM8350 diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index 61b30c6ddec..bac133afc06 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_REGULATOR_FIXED_VOLTAGE) += fixed.o obj-$(CONFIG_REGULATOR_VIRTUAL_CONSUMER) += virtual.o obj-$(CONFIG_REGULATOR_BQ24022) += bq24022.o +obj-$(CONFIG_REGULATOR_TWL4030) += twl4030-regulator.o obj-$(CONFIG_REGULATOR_WM8350) += wm8350-regulator.o obj-$(CONFIG_REGULATOR_WM8400) += wm8400-regulator.o obj-$(CONFIG_REGULATOR_DA903X) += da903x.o diff --git a/drivers/regulator/bq24022.c b/drivers/regulator/bq24022.c index c175e38a4cd..7ecb820ceeb 100644 --- a/drivers/regulator/bq24022.c +++ b/drivers/regulator/bq24022.c @@ -105,7 +105,8 @@ static int __init bq24022_probe(struct platform_device *pdev) ret = gpio_direction_output(pdata->gpio_iset2, 0); ret = gpio_direction_output(pdata->gpio_nce, 1); - bq24022 = regulator_register(&bq24022_desc, &pdev->dev, pdata); + bq24022 = regulator_register(&bq24022_desc, &pdev->dev, + pdata->init_data, pdata); if (IS_ERR(bq24022)) { dev_dbg(&pdev->dev, "couldn't register regulator\n"); ret = PTR_ERR(bq24022); diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index f511a406fca..01f7702a805 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -28,33 +28,7 @@ static DEFINE_MUTEX(regulator_list_mutex); static LIST_HEAD(regulator_list); static LIST_HEAD(regulator_map_list); - -/* - * struct regulator_dev - * - * Voltage / Current regulator class device. One for each regulator. - */ -struct regulator_dev { - struct regulator_desc *desc; - int use_count; - - /* lists we belong to */ - struct list_head list; /* list of all regulators */ - struct list_head slist; /* list of supplied regulators */ - - /* lists we own */ - struct list_head consumer_list; /* consumers we supply */ - struct list_head supply_list; /* regulators we supply */ - - struct blocking_notifier_head notifier; - struct mutex mutex; /* consumer lock */ - struct module *owner; - struct device dev; - struct regulation_constraints *constraints; - struct regulator_dev *supply; /* for tree */ - - void *reg_data; /* regulator_dev data */ -}; +static int has_full_constraints; /* * struct regulator_map @@ -79,7 +53,6 @@ struct regulator { int uA_load; int min_uV; int max_uV; - int enabled; /* count of client enables */ char *supply_name; struct device_attribute dev_attr; struct regulator_dev *rdev; @@ -312,6 +285,47 @@ static ssize_t regulator_state_show(struct device *dev, } static DEVICE_ATTR(state, 0444, regulator_state_show, NULL); +static ssize_t regulator_status_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct regulator_dev *rdev = dev_get_drvdata(dev); + int status; + char *label; + + status = rdev->desc->ops->get_status(rdev); + if (status < 0) + return status; + + switch (status) { + case REGULATOR_STATUS_OFF: + label = "off"; + break; + case REGULATOR_STATUS_ON: + label = "on"; + break; + case REGULATOR_STATUS_ERROR: + label = "error"; + break; + case REGULATOR_STATUS_FAST: + label = "fast"; + break; + case REGULATOR_STATUS_NORMAL: + label = "normal"; + break; + case REGULATOR_STATUS_IDLE: + label = "idle"; + break; + case REGULATOR_STATUS_STANDBY: + label = "standby"; + break; + default: + return -ERANGE; + } + + return sprintf(buf, "%s\n", label); +} +static DEVICE_ATTR(status, 0444, regulator_status_show, NULL); + static ssize_t regulator_min_uA_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -678,6 +692,73 @@ static int set_machine_constraints(struct regulator_dev *rdev, else name = "regulator"; + /* constrain machine-level voltage specs to fit + * the actual range supported by this regulator. + */ + if (ops->list_voltage && rdev->desc->n_voltages) { + int count = rdev->desc->n_voltages; + int i; + int min_uV = INT_MAX; + int max_uV = INT_MIN; + int cmin = constraints->min_uV; + int cmax = constraints->max_uV; + + /* it's safe to autoconfigure fixed-voltage supplies */ + if (count == 1 && !cmin) { + cmin = INT_MIN; + cmax = INT_MAX; + } + + /* voltage constraints are optional */ + if ((cmin == 0) && (cmax == 0)) + goto out; + + /* else require explicit machine-level constraints */ + if (cmin <= 0 || cmax <= 0 || cmax < cmin) { + pr_err("%s: %s '%s' voltage constraints\n", + __func__, "invalid", name); + ret = -EINVAL; + goto out; + } + + /* initial: [cmin..cmax] valid, [min_uV..max_uV] not */ + for (i = 0; i < count; i++) { + int value; + + value = ops->list_voltage(rdev, i); + if (value <= 0) + continue; + + /* maybe adjust [min_uV..max_uV] */ + if (value >= cmin && value < min_uV) + min_uV = value; + if (value <= cmax && value > max_uV) + max_uV = value; + } + + /* final: [min_uV..max_uV] valid iff constraints valid */ + if (max_uV < min_uV) { + pr_err("%s: %s '%s' voltage constraints\n", + __func__, "unsupportable", name); + ret = -EINVAL; + goto out; + } + + /* use regulator's subset of machine constraints */ + if (constraints->min_uV < min_uV) { + pr_debug("%s: override '%s' %s, %d -> %d\n", + __func__, name, "min_uV", + constraints->min_uV, min_uV); + constraints->min_uV = min_uV; + } + if (constraints->max_uV > max_uV) { + pr_debug("%s: override '%s' %s, %d -> %d\n", + __func__, name, "max_uV", + constraints->max_uV, max_uV); + constraints->max_uV = max_uV; + } + } + rdev->constraints = constraints; /* do we need to apply the constraint voltage */ @@ -695,10 +776,6 @@ static int set_machine_constraints(struct regulator_dev *rdev, } } - /* are we enabled at boot time by firmware / bootloader */ - if (rdev->constraints->boot_on) - rdev->use_count = 1; - /* do we need to setup our suspend state */ if (constraints->initial_state) { ret = suspend_prepare(rdev, constraints->initial_state); @@ -710,11 +787,27 @@ static int set_machine_constraints(struct regulator_dev *rdev, } } - /* if always_on is set then turn the regulator on if it's not - * already on. */ - if (constraints->always_on && ops->enable && - ((ops->is_enabled && !ops->is_enabled(rdev)) || - (!ops->is_enabled && !constraints->boot_on))) { + if (constraints->initial_mode) { + if (!ops->set_mode) { + printk(KERN_ERR "%s: no set_mode operation for %s\n", + __func__, name); + ret = -EINVAL; + goto out; + } + + ret = ops->set_mode(rdev, constraints->initial_mode); + if (ret < 0) { + printk(KERN_ERR + "%s: failed to set initial mode for %s: %d\n", + __func__, name, ret); + goto out; + } + } + + /* If the constraints say the regulator should be on at this point + * and we have control then make sure it is enabled. + */ + if ((constraints->always_on || constraints->boot_on) && ops->enable) { ret = ops->enable(rdev); if (ret < 0) { printk(KERN_ERR "%s: failed to enable %s\n", @@ -817,6 +910,19 @@ static void unset_consumer_device_supply(struct regulator_dev *rdev, } } +static void unset_regulator_supplies(struct regulator_dev *rdev) +{ + struct regulator_map *node, *n; + + list_for_each_entry_safe(node, n, ®ulator_map_list, list) { + if (rdev == node->regulator) { + list_del(&node->list); + kfree(node); + return; + } + } +} + #define REG_STR_SIZE 32 static struct regulator *create_regulator(struct regulator_dev *rdev, @@ -898,9 +1004,12 @@ overflow_err: * @id: Supply name or regulator ID. * * Returns a struct regulator corresponding to the regulator producer, - * or IS_ERR() condition containing errno. Use of supply names - * configured via regulator_set_device_supply() is strongly - * encouraged. + * or IS_ERR() condition containing errno. + * + * Use of supply names configured via regulator_set_device_supply() is + * strongly encouraged. It is recommended that the supply name used + * should match the name used for the supply and/or the relevant + * device pins in the datasheet. */ struct regulator *regulator_get(struct device *dev, const char *id) { @@ -922,8 +1031,6 @@ struct regulator *regulator_get(struct device *dev, const char *id) goto found; } } - printk(KERN_ERR "regulator: Unable to get requested regulator: %s\n", - id); mutex_unlock(®ulator_list_mutex); return regulator; @@ -961,10 +1068,6 @@ void regulator_put(struct regulator *regulator) mutex_lock(®ulator_list_mutex); rdev = regulator->rdev; - if (WARN(regulator->enabled, "Releasing supply %s while enabled\n", - regulator->supply_name)) - _regulator_disable(rdev); - /* remove any sysfs entries */ if (regulator->dev) { sysfs_remove_link(&rdev->dev.kobj, regulator->supply_name); @@ -1039,12 +1142,7 @@ int regulator_enable(struct regulator *regulator) int ret = 0; mutex_lock(&rdev->mutex); - if (regulator->enabled == 0) - ret = _regulator_enable(rdev); - else if (regulator->enabled < 0) - ret = -EIO; - if (ret == 0) - regulator->enabled++; + ret = _regulator_enable(rdev); mutex_unlock(&rdev->mutex); return ret; } @@ -1055,6 +1153,11 @@ static int _regulator_disable(struct regulator_dev *rdev) { int ret = 0; + if (WARN(rdev->use_count <= 0, + "unbalanced disables for %s\n", + rdev->desc->name)) + return -EIO; + /* are we the last user and permitted to disable ? */ if (rdev->use_count == 1 && !rdev->constraints->always_on) { @@ -1103,16 +1206,7 @@ int regulator_disable(struct regulator *regulator) int ret = 0; mutex_lock(&rdev->mutex); - if (regulator->enabled == 1) { - ret = _regulator_disable(rdev); - if (ret == 0) - regulator->uA_load = 0; - } else if (WARN(regulator->enabled <= 0, - "unbalanced disables for supply %s\n", - regulator->supply_name)) - ret = -EIO; - if (ret == 0) - regulator->enabled--; + ret = _regulator_disable(rdev); mutex_unlock(&rdev->mutex); return ret; } @@ -1159,7 +1253,6 @@ int regulator_force_disable(struct regulator *regulator) int ret; mutex_lock(®ulator->rdev->mutex); - regulator->enabled = 0; regulator->uA_load = 0; ret = _regulator_force_disable(regulator->rdev); mutex_unlock(®ulator->rdev->mutex); @@ -1204,6 +1297,56 @@ int regulator_is_enabled(struct regulator *regulator) EXPORT_SYMBOL_GPL(regulator_is_enabled); /** + * regulator_count_voltages - count regulator_list_voltage() selectors + * @regulator: regulator source + * + * Returns number of selectors, or negative errno. Selectors are + * numbered starting at zero, and typically correspond to bitfields + * in hardware registers. + */ +int regulator_count_voltages(struct regulator *regulator) +{ + struct regulator_dev *rdev = regulator->rdev; + + return rdev->desc->n_voltages ? : -EINVAL; +} +EXPORT_SYMBOL_GPL(regulator_count_voltages); + +/** + * regulator_list_voltage - enumerate supported voltages + * @regulator: regulator source + * @selector: identify voltage to list + * Context: can sleep + * + * Returns a voltage that can be passed to @regulator_set_voltage(), + * zero if this selector code can't be used on this sytem, or a + * negative errno. + */ +int regulator_list_voltage(struct regulator *regulator, unsigned selector) +{ + struct regulator_dev *rdev = regulator->rdev; + struct regulator_ops *ops = rdev->desc->ops; + int ret; + + if (!ops->list_voltage || selector >= rdev->desc->n_voltages) + return -EINVAL; + + mutex_lock(&rdev->mutex); + ret = ops->list_voltage(rdev, selector); + mutex_unlock(&rdev->mutex); + + if (ret > 0) { + if (ret < rdev->constraints->min_uV) + ret = 0; + else if (ret > rdev->constraints->max_uV) + ret = 0; + } + + return ret; +} +EXPORT_SYMBOL_GPL(regulator_list_voltage); + +/** * regulator_set_voltage - set regulator output voltage * @regulator: regulator source * @min_uV: Minimum required voltage in uV @@ -1243,6 +1386,7 @@ int regulator_set_voltage(struct regulator *regulator, int min_uV, int max_uV) ret = rdev->desc->ops->set_voltage(rdev, min_uV, max_uV); out: + _notifier_call_chain(rdev, REGULATOR_EVENT_VOLTAGE_CHANGE, NULL); mutex_unlock(&rdev->mutex); return ret; } @@ -1543,20 +1687,23 @@ int regulator_unregister_notifier(struct regulator *regulator, } EXPORT_SYMBOL_GPL(regulator_unregister_notifier); -/* notify regulator consumers and downstream regulator consumers */ +/* notify regulator consumers and downstream regulator consumers. + * Note mutex must be held by caller. + */ static void _notifier_call_chain(struct regulator_dev *rdev, unsigned long event, void *data) { struct regulator_dev *_rdev; /* call rdev chain first */ - mutex_lock(&rdev->mutex); blocking_notifier_call_chain(&rdev->notifier, event, NULL); - mutex_unlock(&rdev->mutex); /* now notify regulator we supply */ - list_for_each_entry(_rdev, &rdev->supply_list, slist) - _notifier_call_chain(_rdev, event, data); + list_for_each_entry(_rdev, &rdev->supply_list, slist) { + mutex_lock(&_rdev->mutex); + _notifier_call_chain(_rdev, event, data); + mutex_unlock(&_rdev->mutex); + } } /** @@ -1703,6 +1850,7 @@ EXPORT_SYMBOL_GPL(regulator_bulk_free); * * Called by regulator drivers to notify clients a regulator event has * occurred. We also notify regulator clients downstream. + * Note lock must be held by caller. */ int regulator_notifier_call_chain(struct regulator_dev *rdev, unsigned long event, void *data) @@ -1744,6 +1892,11 @@ static int add_regulator_attributes(struct regulator_dev *rdev) if (status < 0) return status; } + if (ops->get_status) { + status = device_create_file(dev, &dev_attr_status); + if (status < 0) + return status; + } /* some attributes are type-specific */ if (rdev->desc->type == REGULATOR_CURRENT) { @@ -1828,17 +1981,18 @@ static int add_regulator_attributes(struct regulator_dev *rdev) * regulator_register - register regulator * @regulator_desc: regulator to register * @dev: struct device for the regulator + * @init_data: platform provided init data, passed through by driver * @driver_data: private regulator data * * Called by regulator drivers to register a regulator. * Returns 0 on success. */ struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc, - struct device *dev, void *driver_data) + struct device *dev, struct regulator_init_data *init_data, + void *driver_data) { static atomic_t regulator_no = ATOMIC_INIT(0); struct regulator_dev *rdev; - struct regulator_init_data *init_data = dev->platform_data; int ret, i; if (regulator_desc == NULL) @@ -1945,6 +2099,7 @@ void regulator_unregister(struct regulator_dev *rdev) return; mutex_lock(®ulator_list_mutex); + unset_regulator_supplies(rdev); list_del(&rdev->list); if (rdev->supply) sysfs_remove_link(&rdev->dev.kobj, "supply"); @@ -1989,6 +2144,23 @@ out: EXPORT_SYMBOL_GPL(regulator_suspend_prepare); /** + * regulator_has_full_constraints - the system has fully specified constraints + * + * Calling this function will cause the regulator API to disable all + * regulators which have a zero use count and don't have an always_on + * constraint in a late_initcall. + * + * The intention is that this will become the default behaviour in a + * future kernel release so users are encouraged to use this facility + * now. + */ +void regulator_has_full_constraints(void) +{ + has_full_constraints = 1; +} +EXPORT_SYMBOL_GPL(regulator_has_full_constraints); + +/** * rdev_get_drvdata - get rdev regulator driver data * @rdev: regulator * @@ -2055,3 +2227,77 @@ static int __init regulator_init(void) /* init early to allow our consumers to complete system booting */ core_initcall(regulator_init); + +static int __init regulator_init_complete(void) +{ + struct regulator_dev *rdev; + struct regulator_ops *ops; + struct regulation_constraints *c; + int enabled, ret; + const char *name; + + mutex_lock(®ulator_list_mutex); + + /* If we have a full configuration then disable any regulators + * which are not in use or always_on. This will become the + * default behaviour in the future. + */ + list_for_each_entry(rdev, ®ulator_list, list) { + ops = rdev->desc->ops; + c = rdev->constraints; + + if (c->name) + name = c->name; + else if (rdev->desc->name) + name = rdev->desc->name; + else + name = "regulator"; + + if (!ops->disable || c->always_on) + continue; + + mutex_lock(&rdev->mutex); + + if (rdev->use_count) + goto unlock; + + /* If we can't read the status assume it's on. */ + if (ops->is_enabled) + enabled = ops->is_enabled(rdev); + else + enabled = 1; + + if (!enabled) + goto unlock; + + if (has_full_constraints) { + /* We log since this may kill the system if it + * goes wrong. */ + printk(KERN_INFO "%s: disabling %s\n", + __func__, name); + ret = ops->disable(rdev); + if (ret != 0) { + printk(KERN_ERR + "%s: couldn't disable %s: %d\n", + __func__, name, ret); + } + } else { + /* The intention is that in future we will + * assume that full constraints are provided + * so warn even if we aren't going to do + * anything here. + */ + printk(KERN_WARNING + "%s: incomplete constraints, leaving %s on\n", + __func__, name); + } + +unlock: + mutex_unlock(&rdev->mutex); + } + + mutex_unlock(®ulator_list_mutex); + + return 0; +} +late_initcall(regulator_init_complete); diff --git a/drivers/regulator/da903x.c b/drivers/regulator/da903x.c index fe77730a7ed..72b15495183 100644 --- a/drivers/regulator/da903x.c +++ b/drivers/regulator/da903x.c @@ -471,7 +471,8 @@ static int __devinit da903x_regulator_probe(struct platform_device *pdev) if (ri->desc.id == DA9030_ID_LDO1 || ri->desc.id == DA9030_ID_LDO15) ri->desc.ops = &da9030_regulator_ldo1_15_ops; - rdev = regulator_register(&ri->desc, &pdev->dev, ri); + rdev = regulator_register(&ri->desc, &pdev->dev, + pdev->dev.platform_data, ri); if (IS_ERR(rdev)) { dev_err(&pdev->dev, "failed to register regulator %s\n", ri->desc.name); diff --git a/drivers/regulator/fixed.c b/drivers/regulator/fixed.c index d31db3e1491..23d554628a7 100644 --- a/drivers/regulator/fixed.c +++ b/drivers/regulator/fixed.c @@ -73,7 +73,8 @@ static int regulator_fixed_voltage_probe(struct platform_device *pdev) drvdata->microvolts = config->microvolts; - drvdata->dev = regulator_register(&drvdata->desc, drvdata); + drvdata->dev = regulator_register(&drvdata->desc, &pdev->dev, + config->init_data, drvdata); if (IS_ERR(drvdata->dev)) { ret = PTR_ERR(drvdata->dev); goto err_name; diff --git a/drivers/regulator/pcf50633-regulator.c b/drivers/regulator/pcf50633-regulator.c index 4cc85ec6e12..cd761d85c8f 100644 --- a/drivers/regulator/pcf50633-regulator.c +++ b/drivers/regulator/pcf50633-regulator.c @@ -284,7 +284,8 @@ static int __devinit pcf50633_regulator_probe(struct platform_device *pdev) /* Already set by core driver */ pcf = platform_get_drvdata(pdev); - rdev = regulator_register(®ulators[pdev->id], &pdev->dev, pcf); + rdev = regulator_register(®ulators[pdev->id], &pdev->dev, + pdev->dev.platform_data, pcf); if (IS_ERR(rdev)) return PTR_ERR(rdev); diff --git a/drivers/regulator/twl4030-regulator.c b/drivers/regulator/twl4030-regulator.c new file mode 100644 index 00000000000..e2032fb60b5 --- /dev/null +++ b/drivers/regulator/twl4030-regulator.c @@ -0,0 +1,500 @@ +/* + * twl4030-regulator.c -- support regulators in twl4030 family chips + * + * Copyright (C) 2008 David Brownell + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/err.h> +#include <linux/platform_device.h> +#include <linux/regulator/driver.h> +#include <linux/regulator/machine.h> +#include <linux/i2c/twl4030.h> + + +/* + * The TWL4030/TW5030/TPS659x0 family chips include power management, a + * USB OTG transceiver, an RTC, ADC, PWM, and lots more. Some versions + * include an audio codec, battery charger, and more voltage regulators. + * These chips are often used in OMAP-based systems. + * + * This driver implements software-based resource control for various + * voltage regulators. This is usually augmented with state machine + * based control. + */ + +struct twlreg_info { + /* start of regulator's PM_RECEIVER control register bank */ + u8 base; + + /* twl4030 resource ID, for resource control state machine */ + u8 id; + + /* voltage in mV = table[VSEL]; table_len must be a power-of-two */ + u8 table_len; + const u16 *table; + + /* chip constraints on regulator behavior */ + u16 min_mV; + + /* used by regulator core */ + struct regulator_desc desc; +}; + + +/* LDO control registers ... offset is from the base of its register bank. + * The first three registers of all power resource banks help hardware to + * manage the various resource groups. + */ +#define VREG_GRP 0 +#define VREG_TYPE 1 +#define VREG_REMAP 2 +#define VREG_DEDICATED 3 /* LDO control */ + + +static inline int +twl4030reg_read(struct twlreg_info *info, unsigned offset) +{ + u8 value; + int status; + + status = twl4030_i2c_read_u8(TWL4030_MODULE_PM_RECEIVER, + &value, info->base + offset); + return (status < 0) ? status : value; +} + +static inline int +twl4030reg_write(struct twlreg_info *info, unsigned offset, u8 value) +{ + return twl4030_i2c_write_u8(TWL4030_MODULE_PM_RECEIVER, + value, info->base + offset); +} + +/*----------------------------------------------------------------------*/ + +/* generic power resource operations, which work on all regulators */ + +static int twl4030reg_grp(struct regulator_dev *rdev) +{ + return twl4030reg_read(rdev_get_drvdata(rdev), VREG_GRP); +} + +/* + * Enable/disable regulators by joining/leaving the P1 (processor) group. + * We assume nobody else is updating the DEV_GRP registers. + */ + +#define P3_GRP BIT(7) /* "peripherals" */ +#define P2_GRP BIT(6) /* secondary processor, modem, etc */ +#define P1_GRP BIT(5) /* CPU/Linux */ + +static int twl4030reg_is_enabled(struct regulator_dev *rdev) +{ + int state = twl4030reg_grp(rdev); + + if (state < 0) + return state; + + return (state & P1_GRP) != 0; +} + +static int twl4030reg_enable(struct regulator_dev *rdev) +{ + struct twlreg_info *info = rdev_get_drvdata(rdev); + int grp; + + grp = twl4030reg_read(info, VREG_GRP); + if (grp < 0) + return grp; + + grp |= P1_GRP; + return twl4030reg_write(info, VREG_GRP, grp); +} + +static int twl4030reg_disable(struct regulator_dev *rdev) +{ + struct twlreg_info *info = rdev_get_drvdata(rdev); + int grp; + + grp = twl4030reg_read(info, VREG_GRP); + if (grp < 0) + return grp; + + grp &= ~P1_GRP; + return twl4030reg_write(info, VREG_GRP, grp); +} + +static int twl4030reg_get_status(struct regulator_dev *rdev) +{ + int state = twl4030reg_grp(rdev); + + if (state < 0) + return state; + state &= 0x0f; + + /* assume state != WARM_RESET; we'd not be running... */ + if (!state) + return REGULATOR_STATUS_OFF; + return (state & BIT(3)) + ? REGULATOR_STATUS_NORMAL + : REGULATOR_STATUS_STANDBY; +} + +static int twl4030reg_set_mode(struct regulator_dev *rdev, unsigned mode) +{ + struct twlreg_info *info = rdev_get_drvdata(rdev); + unsigned message; + int status; + + /* We can only set the mode through state machine commands... */ + switch (mode) { + case REGULATOR_MODE_NORMAL: + message = MSG_SINGULAR(DEV_GRP_P1, info->id, RES_STATE_ACTIVE); + break; + case REGULATOR_MODE_STANDBY: + message = MSG_SINGULAR(DEV_GRP_P1, info->id, RES_STATE_SLEEP); + break; + default: + return -EINVAL; + } + + /* Ensure the resource is associated with some group */ + status = twl4030reg_grp(rdev); + if (status < 0) + return status; + if (!(status & (P3_GRP | P2_GRP | P1_GRP))) + return -EACCES; + + status = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, + message >> 8, 0x15 /* PB_WORD_MSB */ ); + if (status >= 0) + return status; + + return twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, + message, 0x16 /* PB_WORD_LSB */ ); +} + +/*----------------------------------------------------------------------*/ + +/* + * Support for adjustable-voltage LDOs uses a four bit (or less) voltage + * select field in its control register. We use tables indexed by VSEL + * to record voltages in milliVolts. (Accuracy is about three percent.) + * + * Note that VSEL values for VAUX2 changed in twl5030 and newer silicon; + * currently handled by listing two slightly different VAUX2 regulators, + * only one of which will be configured. + * + * VSEL values documented as "TI cannot support these values" are flagged + * in these tables as UNSUP() values; we normally won't assign them. + * + * VAUX3 at 3V is incorrectly listed in some TI manuals as unsupported. + * TI are revising the twl5030/tps659x0 specs to support that 3.0V setting. + */ +#ifdef CONFIG_TWL4030_ALLOW_UNSUPPORTED +#define UNSUP_MASK 0x0000 +#else +#define UNSUP_MASK 0x8000 +#endif + +#define UNSUP(x) (UNSUP_MASK | (x)) +#define IS_UNSUP(x) (UNSUP_MASK & (x)) +#define LDO_MV(x) (~UNSUP_MASK & (x)) + + +static const u16 VAUX1_VSEL_table[] = { + UNSUP(1500), UNSUP(1800), 2500, 2800, + 3000, 3000, 3000, 3000, +}; +static const u16 VAUX2_4030_VSEL_table[] = { + UNSUP(1000), UNSUP(1000), UNSUP(1200), 1300, + 1500, 1800, UNSUP(1850), 2500, + UNSUP(2600), 2800, UNSUP(2850), UNSUP(3000), + UNSUP(3150), UNSUP(3150), UNSUP(3150), UNSUP(3150), +}; +static const u16 VAUX2_VSEL_table[] = { + 1700, 1700, 1900, 1300, + 1500, 1800, 2000, 2500, + 2100, 2800, 2200, 2300, + 2400, 2400, 2400, 2400, +}; +static const u16 VAUX3_VSEL_table[] = { + 1500, 1800, 2500, 2800, + 3000, 3000, 3000, 3000, +}; +static const u16 VAUX4_VSEL_table[] = { + 700, 1000, 1200, UNSUP(1300), + 1500, 1800, UNSUP(1850), 2500, + UNSUP(2600), 2800, UNSUP(2850), UNSUP(3000), + UNSUP(3150), UNSUP(3150), UNSUP(3150), UNSUP(3150), +}; +static const u16 VMMC1_VSEL_table[] = { + 1850, 2850, 3000, 3150, +}; +static const u16 VMMC2_VSEL_table[] = { + UNSUP(1000), UNSUP(1000), UNSUP(1200), UNSUP(1300), + UNSUP(1500), UNSUP(1800), 1850, UNSUP(2500), + 2600, 2800, 2850, 3000, + 3150, 3150, 3150, 3150, +}; +static const u16 VPLL1_VSEL_table[] = { + 1000, 1200, 1300, 1800, + UNSUP(2800), UNSUP(3000), UNSUP(3000), UNSUP(3000), +}; +static const u16 VPLL2_VSEL_table[] = { + 700, 1000, 1200, 1300, + UNSUP(1500), 1800, UNSUP(1850), UNSUP(2500), + UNSUP(2600), UNSUP(2800), UNSUP(2850), UNSUP(3000), + UNSUP(3150), UNSUP(3150), UNSUP(3150), UNSUP(3150), +}; +static const u16 VSIM_VSEL_table[] = { + UNSUP(1000), UNSUP(1200), UNSUP(1300), 1800, + 2800, 3000, 3000, 3000, +}; +static const u16 VDAC_VSEL_table[] = { + 1200, 1300, 1800, 1800, +}; + + +static int twl4030ldo_list_voltage(struct regulator_dev *rdev, unsigned index) +{ + struct twlreg_info *info = rdev_get_drvdata(rdev); + int mV = info->table[index]; + + return IS_UNSUP(mV) ? 0 : (LDO_MV(mV) * 1000); +} + +static int +twl4030ldo_set_voltage(struct regulator_dev *rdev, int min_uV, int max_uV) +{ + struct twlreg_info *info = rdev_get_drvdata(rdev); + int vsel; + + for (vsel = 0; vsel < info->table_len; vsel++) { + int mV = info->table[vsel]; + int uV; + + if (IS_UNSUP(mV)) + continue; + uV = LDO_MV(mV) * 1000; + + /* REVISIT for VAUX2, first match may not be best/lowest */ + + /* use the first in-range value */ + if (min_uV <= uV && uV <= max_uV) + return twl4030reg_write(info, VREG_DEDICATED, vsel); + } + + return -EDOM; +} + +static int twl4030ldo_get_voltage(struct regulator_dev *rdev) +{ + struct twlreg_info *info = rdev_get_drvdata(rdev); + int vsel = twl4030reg_read(info, VREG_DEDICATED); + + if (vsel < 0) + return vsel; + + vsel &= info->table_len - 1; + return LDO_MV(info->table[vsel]) * 1000; +} + +static struct regulator_ops twl4030ldo_ops = { + .list_voltage = twl4030ldo_list_voltage, + + .set_voltage = twl4030ldo_set_voltage, + .get_voltage = twl4030ldo_get_voltage, + + .enable = twl4030reg_enable, + .disable = twl4030reg_disable, + .is_enabled = twl4030reg_is_enabled, + + .set_mode = twl4030reg_set_mode, + + .get_status = twl4030reg_get_status, +}; + +/*----------------------------------------------------------------------*/ + +/* + * Fixed voltage LDOs don't have a VSEL field to update. + */ +static int twl4030fixed_list_voltage(struct regulator_dev *rdev, unsigned index) +{ + struct twlreg_info *info = rdev_get_drvdata(rdev); + + return info->min_mV * 1000; +} + +static int twl4030fixed_get_voltage(struct regulator_dev *rdev) +{ + struct twlreg_info *info = rdev_get_drvdata(rdev); + + return info->min_mV * 1000; +} + +static struct regulator_ops twl4030fixed_ops = { + .list_voltage = twl4030fixed_list_voltage, + + .get_voltage = twl4030fixed_get_voltage, + + .enable = twl4030reg_enable, + .disable = twl4030reg_disable, + .is_enabled = twl4030reg_is_enabled, + + .set_mode = twl4030reg_set_mode, + + .get_status = twl4030reg_get_status, +}; + +/*----------------------------------------------------------------------*/ + +#define TWL_ADJUSTABLE_LDO(label, offset, num) { \ + .base = offset, \ + .id = num, \ + .table_len = ARRAY_SIZE(label##_VSEL_table), \ + .table = label##_VSEL_table, \ + .desc = { \ + .name = #label, \ + .id = TWL4030_REG_##label, \ + .n_voltages = ARRAY_SIZE(label##_VSEL_table), \ + .ops = &twl4030ldo_ops, \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + }, \ + } + +#define TWL_FIXED_LDO(label, offset, mVolts, num) { \ + .base = offset, \ + .id = num, \ + .min_mV = mVolts, \ + .desc = { \ + .name = #label, \ + .id = TWL4030_REG_##label, \ + .n_voltages = 1, \ + .ops = &twl4030fixed_ops, \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + }, \ + } + +/* + * We list regulators here if systems need some level of + * software control over them after boot. + */ +static struct twlreg_info twl4030_regs[] = { + TWL_ADJUSTABLE_LDO(VAUX1, 0x17, 1), + TWL_ADJUSTABLE_LDO(VAUX2_4030, 0x1b, 2), + TWL_ADJUSTABLE_LDO(VAUX2, 0x1b, 2), + TWL_ADJUSTABLE_LDO(VAUX3, 0x1f, 3), + TWL_ADJUSTABLE_LDO(VAUX4, 0x23, 4), + TWL_ADJUSTABLE_LDO(VMMC1, 0x27, 5), + TWL_ADJUSTABLE_LDO(VMMC2, 0x2b, 6), + /* + TWL_ADJUSTABLE_LDO(VPLL1, 0x2f, 7), + */ + TWL_ADJUSTABLE_LDO(VPLL2, 0x33, 8), + TWL_ADJUSTABLE_LDO(VSIM, 0x37, 9), + TWL_ADJUSTABLE_LDO(VDAC, 0x3b, 10), + /* + TWL_ADJUSTABLE_LDO(VINTANA1, 0x3f, 11), + TWL_ADJUSTABLE_LDO(VINTANA2, 0x43, 12), + TWL_ADJUSTABLE_LDO(VINTDIG, 0x47, 13), + TWL_SMPS(VIO, 0x4b, 14), + TWL_SMPS(VDD1, 0x55, 15), + TWL_SMPS(VDD2, 0x63, 16), + */ + TWL_FIXED_LDO(VUSB1V5, 0x71, 1500, 17), + TWL_FIXED_LDO(VUSB1V8, 0x74, 1800, 18), + TWL_FIXED_LDO(VUSB3V1, 0x77, 3100, 19), + /* VUSBCP is managed *only* by the USB subchip */ +}; + +static int twl4030reg_probe(struct platform_device *pdev) +{ + int i; + struct twlreg_info *info; + struct regulator_init_data *initdata; + struct regulation_constraints *c; + struct regulator_dev *rdev; + + for (i = 0, info = NULL; i < ARRAY_SIZE(twl4030_regs); i++) { + if (twl4030_regs[i].desc.id != pdev->id) + continue; + info = twl4030_regs + i; + break; + } + if (!info) + return -ENODEV; + + initdata = pdev->dev.platform_data; + if (!initdata) + return -EINVAL; + + /* Constrain board-specific capabilities according to what + * this driver and the chip itself can actually do. + */ + c = &initdata->constraints; + c->valid_modes_mask &= REGULATOR_MODE_NORMAL | REGULATOR_MODE_STANDBY; + c->valid_ops_mask &= REGULATOR_CHANGE_VOLTAGE + | REGULATOR_CHANGE_MODE + | REGULATOR_CHANGE_STATUS; + + rdev = regulator_register(&info->desc, &pdev->dev, initdata, info); + if (IS_ERR(rdev)) { + dev_err(&pdev->dev, "can't register %s, %ld\n", + info->desc.name, PTR_ERR(rdev)); + return PTR_ERR(rdev); + } + platform_set_drvdata(pdev, rdev); + + /* NOTE: many regulators support short-circuit IRQs (presentable + * as REGULATOR_OVER_CURRENT notifications?) configured via: + * - SC_CONFIG + * - SC_DETECT1 (vintana2, vmmc1/2, vaux1/2/3/4) + * - SC_DETECT2 (vusb, vdac, vio, vdd1/2, vpll2) + * - IT_CONFIG + */ + + return 0; +} + +static int __devexit twl4030reg_remove(struct platform_device *pdev) +{ + regulator_unregister(platform_get_drvdata(pdev)); + return 0; +} + +MODULE_ALIAS("platform:twl4030_reg"); + +static struct platform_driver twl4030reg_driver = { + .probe = twl4030reg_probe, + .remove = __devexit_p(twl4030reg_remove), + /* NOTE: short name, to work around driver model truncation of + * "twl4030_regulator.12" (and friends) to "twl4030_regulator.1". + */ + .driver.name = "twl4030_reg", + .driver.owner = THIS_MODULE, +}; + +static int __init twl4030reg_init(void) +{ + return platform_driver_register(&twl4030reg_driver); +} +subsys_initcall(twl4030reg_init); + +static void __exit twl4030reg_exit(void) +{ + platform_driver_unregister(&twl4030reg_driver); +} +module_exit(twl4030reg_exit) + +MODULE_DESCRIPTION("TWL4030 regulator driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/regulator/virtual.c b/drivers/regulator/virtual.c index 5ddb464b1c3..3d08348584e 100644 --- a/drivers/regulator/virtual.c +++ b/drivers/regulator/virtual.c @@ -226,13 +226,17 @@ static ssize_t set_mode(struct device *dev, struct device_attribute *attr, unsigned int mode; int ret; - if (strncmp(buf, "fast", strlen("fast")) == 0) + /* + * sysfs_streq() doesn't need the \n's, but we add them so the strings + * will be shared with show_mode(), above. + */ + if (sysfs_streq(buf, "fast\n") == 0) mode = REGULATOR_MODE_FAST; - else if (strncmp(buf, "normal", strlen("normal")) == 0) + else if (sysfs_streq(buf, "normal\n") == 0) mode = REGULATOR_MODE_NORMAL; - else if (strncmp(buf, "idle", strlen("idle")) == 0) + else if (sysfs_streq(buf, "idle\n") == 0) mode = REGULATOR_MODE_IDLE; - else if (strncmp(buf, "standby", strlen("standby")) == 0) + else if (sysfs_streq(buf, "standby\n") == 0) mode = REGULATOR_MODE_STANDBY; else { dev_err(dev, "Configuring invalid mode\n"); @@ -256,7 +260,7 @@ static DEVICE_ATTR(min_microamps, 0666, show_min_uA, set_min_uA); static DEVICE_ATTR(max_microamps, 0666, show_max_uA, set_max_uA); static DEVICE_ATTR(mode, 0666, show_mode, set_mode); -struct device_attribute *attributes[] = { +static struct device_attribute *attributes[] = { &dev_attr_min_microvolts, &dev_attr_max_microvolts, &dev_attr_min_microamps, diff --git a/drivers/regulator/wm8350-regulator.c b/drivers/regulator/wm8350-regulator.c index 5056e23e441..771eca1066b 100644 --- a/drivers/regulator/wm8350-regulator.c +++ b/drivers/regulator/wm8350-regulator.c @@ -24,6 +24,9 @@ #include <linux/regulator/driver.h> #include <linux/regulator/machine.h> +/* Maximum value possible for VSEL */ +#define WM8350_DCDC_MAX_VSEL 0x66 + /* Microamps */ static const int isink_cur[] = { 4, @@ -385,6 +388,14 @@ static int wm8350_dcdc_get_voltage(struct regulator_dev *rdev) return wm8350_dcdc_val_to_mvolts(val) * 1000; } +static int wm8350_dcdc_list_voltage(struct regulator_dev *rdev, + unsigned selector) +{ + if (selector > WM8350_DCDC_MAX_VSEL) + return -EINVAL; + return wm8350_dcdc_val_to_mvolts(selector) * 1000; +} + static int wm8350_dcdc_set_suspend_voltage(struct regulator_dev *rdev, int uV) { struct wm8350 *wm8350 = rdev_get_drvdata(rdev); @@ -775,6 +786,14 @@ static int wm8350_ldo_get_voltage(struct regulator_dev *rdev) return wm8350_ldo_val_to_mvolts(val) * 1000; } +static int wm8350_ldo_list_voltage(struct regulator_dev *rdev, + unsigned selector) +{ + if (selector > WM8350_LDO1_VSEL_MASK) + return -EINVAL; + return wm8350_ldo_val_to_mvolts(selector) * 1000; +} + int wm8350_dcdc_set_slot(struct wm8350 *wm8350, int dcdc, u16 start, u16 stop, u16 fault) { @@ -1031,18 +1050,30 @@ static unsigned int wm8350_dcdc_get_mode(struct regulator_dev *rdev) int dcdc = rdev_get_id(rdev); u16 mask, sleep, active, force; int mode = REGULATOR_MODE_NORMAL; + int reg; - if (dcdc < WM8350_DCDC_1 || dcdc > WM8350_DCDC_6) - return -EINVAL; - - if (dcdc == WM8350_DCDC_2 || dcdc == WM8350_DCDC_5) + switch (dcdc) { + case WM8350_DCDC_1: + reg = WM8350_DCDC1_FORCE_PWM; + break; + case WM8350_DCDC_3: + reg = WM8350_DCDC3_FORCE_PWM; + break; + case WM8350_DCDC_4: + reg = WM8350_DCDC4_FORCE_PWM; + break; + case WM8350_DCDC_6: + reg = WM8350_DCDC6_FORCE_PWM; + break; + default: return -EINVAL; + } mask = 1 << (dcdc - WM8350_DCDC_1); active = wm8350_reg_read(wm8350, WM8350_DCDC_ACTIVE_OPTIONS) & mask; + force = wm8350_reg_read(wm8350, reg) & WM8350_DCDC1_FORCE_PWM_ENA; sleep = wm8350_reg_read(wm8350, WM8350_DCDC_SLEEP_OPTIONS) & mask; - force = wm8350_reg_read(wm8350, WM8350_DCDC1_FORCE_PWM) - & WM8350_DCDC1_FORCE_PWM_ENA; + dev_dbg(wm8350->dev, "mask %x active %x sleep %x force %x", mask, active, sleep, force); @@ -1150,6 +1181,7 @@ static int wm8350_ldo_is_enabled(struct regulator_dev *rdev) static struct regulator_ops wm8350_dcdc_ops = { .set_voltage = wm8350_dcdc_set_voltage, .get_voltage = wm8350_dcdc_get_voltage, + .list_voltage = wm8350_dcdc_list_voltage, .enable = wm8350_dcdc_enable, .disable = wm8350_dcdc_disable, .get_mode = wm8350_dcdc_get_mode, @@ -1173,6 +1205,7 @@ static struct regulator_ops wm8350_dcdc2_5_ops = { static struct regulator_ops wm8350_ldo_ops = { .set_voltage = wm8350_ldo_set_voltage, .get_voltage = wm8350_ldo_get_voltage, + .list_voltage = wm8350_ldo_list_voltage, .enable = wm8350_ldo_enable, .disable = wm8350_ldo_disable, .is_enabled = wm8350_ldo_is_enabled, @@ -1197,6 +1230,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = { .ops = &wm8350_dcdc_ops, .irq = WM8350_IRQ_UV_DC1, .type = REGULATOR_VOLTAGE, + .n_voltages = WM8350_DCDC_MAX_VSEL + 1, .owner = THIS_MODULE, }, { @@ -1213,6 +1247,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = { .ops = &wm8350_dcdc_ops, .irq = WM8350_IRQ_UV_DC3, .type = REGULATOR_VOLTAGE, + .n_voltages = WM8350_DCDC_MAX_VSEL + 1, .owner = THIS_MODULE, }, { @@ -1221,6 +1256,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = { .ops = &wm8350_dcdc_ops, .irq = WM8350_IRQ_UV_DC4, .type = REGULATOR_VOLTAGE, + .n_voltages = WM8350_DCDC_MAX_VSEL + 1, .owner = THIS_MODULE, }, { @@ -1237,6 +1273,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = { .ops = &wm8350_dcdc_ops, .irq = WM8350_IRQ_UV_DC6, .type = REGULATOR_VOLTAGE, + .n_voltages = WM8350_DCDC_MAX_VSEL + 1, .owner = THIS_MODULE, }, { @@ -1245,6 +1282,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = { .ops = &wm8350_ldo_ops, .irq = WM8350_IRQ_UV_LDO1, .type = REGULATOR_VOLTAGE, + .n_voltages = WM8350_LDO1_VSEL_MASK + 1, .owner = THIS_MODULE, }, { @@ -1253,6 +1291,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = { .ops = &wm8350_ldo_ops, .irq = WM8350_IRQ_UV_LDO2, .type = REGULATOR_VOLTAGE, + .n_voltages = WM8350_LDO2_VSEL_MASK + 1, .owner = THIS_MODULE, }, { @@ -1261,6 +1300,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = { .ops = &wm8350_ldo_ops, .irq = WM8350_IRQ_UV_LDO3, .type = REGULATOR_VOLTAGE, + .n_voltages = WM8350_LDO3_VSEL_MASK + 1, .owner = THIS_MODULE, }, { @@ -1269,6 +1309,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = { .ops = &wm8350_ldo_ops, .irq = WM8350_IRQ_UV_LDO4, .type = REGULATOR_VOLTAGE, + .n_voltages = WM8350_LDO4_VSEL_MASK + 1, .owner = THIS_MODULE, }, { @@ -1293,6 +1334,7 @@ static void pmic_uv_handler(struct wm8350 *wm8350, int irq, void *data) { struct regulator_dev *rdev = (struct regulator_dev *)data; + mutex_lock(&rdev->mutex); if (irq == WM8350_IRQ_CS1 || irq == WM8350_IRQ_CS2) regulator_notifier_call_chain(rdev, REGULATOR_EVENT_REGULATION_OUT, @@ -1301,6 +1343,7 @@ static void pmic_uv_handler(struct wm8350 *wm8350, int irq, void *data) regulator_notifier_call_chain(rdev, REGULATOR_EVENT_UNDER_VOLTAGE, wm8350); + mutex_unlock(&rdev->mutex); } static int wm8350_regulator_probe(struct platform_device *pdev) @@ -1333,9 +1376,9 @@ static int wm8350_regulator_probe(struct platform_device *pdev) break; } - /* register regulator */ rdev = regulator_register(&wm8350_reg[pdev->id], &pdev->dev, + pdev->dev.platform_data, dev_get_drvdata(&pdev->dev)); if (IS_ERR(rdev)) { dev_err(&pdev->dev, "failed to register %s\n", diff --git a/drivers/regulator/wm8400-regulator.c b/drivers/regulator/wm8400-regulator.c index 56e23d44ba5..15742602907 100644 --- a/drivers/regulator/wm8400-regulator.c +++ b/drivers/regulator/wm8400-regulator.c @@ -43,6 +43,18 @@ static int wm8400_ldo_disable(struct regulator_dev *dev) WM8400_LDO1_ENA, 0); } +static int wm8400_ldo_list_voltage(struct regulator_dev *dev, + unsigned selector) +{ + if (selector > WM8400_LDO1_VSEL_MASK) + return -EINVAL; + + if (selector < 15) + return 900000 + (selector * 50000); + else + return 1600000 + ((selector - 14) * 100000); +} + static int wm8400_ldo_get_voltage(struct regulator_dev *dev) { struct wm8400 *wm8400 = rdev_get_drvdata(dev); @@ -51,10 +63,7 @@ static int wm8400_ldo_get_voltage(struct regulator_dev *dev) val = wm8400_reg_read(wm8400, WM8400_LDO1_CONTROL + rdev_get_id(dev)); val &= WM8400_LDO1_VSEL_MASK; - if (val < 15) - return 900000 + (val * 50000); - else - return 1600000 + ((val - 14) * 100000); + return wm8400_ldo_list_voltage(dev, val); } static int wm8400_ldo_set_voltage(struct regulator_dev *dev, @@ -92,6 +101,7 @@ static struct regulator_ops wm8400_ldo_ops = { .is_enabled = wm8400_ldo_is_enabled, .enable = wm8400_ldo_enable, .disable = wm8400_ldo_disable, + .list_voltage = wm8400_ldo_list_voltage, .get_voltage = wm8400_ldo_get_voltage, .set_voltage = wm8400_ldo_set_voltage, }; @@ -124,6 +134,15 @@ static int wm8400_dcdc_disable(struct regulator_dev *dev) WM8400_DC1_ENA, 0); } +static int wm8400_dcdc_list_voltage(struct regulator_dev *dev, + unsigned selector) +{ + if (selector > WM8400_DC1_VSEL_MASK) + return -EINVAL; + + return 850000 + (selector * 25000); +} + static int wm8400_dcdc_get_voltage(struct regulator_dev *dev) { struct wm8400 *wm8400 = rdev_get_drvdata(dev); @@ -237,6 +256,7 @@ static struct regulator_ops wm8400_dcdc_ops = { .is_enabled = wm8400_dcdc_is_enabled, .enable = wm8400_dcdc_enable, .disable = wm8400_dcdc_disable, + .list_voltage = wm8400_dcdc_list_voltage, .get_voltage = wm8400_dcdc_get_voltage, .set_voltage = wm8400_dcdc_set_voltage, .get_mode = wm8400_dcdc_get_mode, @@ -249,6 +269,7 @@ static struct regulator_desc regulators[] = { .name = "LDO1", .id = WM8400_LDO1, .ops = &wm8400_ldo_ops, + .n_voltages = WM8400_LDO1_VSEL_MASK + 1, .type = REGULATOR_VOLTAGE, .owner = THIS_MODULE, }, @@ -256,6 +277,7 @@ static struct regulator_desc regulators[] = { .name = "LDO2", .id = WM8400_LDO2, .ops = &wm8400_ldo_ops, + .n_voltages = WM8400_LDO2_VSEL_MASK + 1, .type = REGULATOR_VOLTAGE, .owner = THIS_MODULE, }, @@ -263,6 +285,7 @@ static struct regulator_desc regulators[] = { .name = "LDO3", .id = WM8400_LDO3, .ops = &wm8400_ldo_ops, + .n_voltages = WM8400_LDO3_VSEL_MASK + 1, .type = REGULATOR_VOLTAGE, .owner = THIS_MODULE, }, @@ -270,6 +293,7 @@ static struct regulator_desc regulators[] = { .name = "LDO4", .id = WM8400_LDO4, .ops = &wm8400_ldo_ops, + .n_voltages = WM8400_LDO4_VSEL_MASK + 1, .type = REGULATOR_VOLTAGE, .owner = THIS_MODULE, }, @@ -277,6 +301,7 @@ static struct regulator_desc regulators[] = { .name = "DCDC1", .id = WM8400_DCDC1, .ops = &wm8400_dcdc_ops, + .n_voltages = WM8400_DC1_VSEL_MASK + 1, .type = REGULATOR_VOLTAGE, .owner = THIS_MODULE, }, @@ -284,6 +309,7 @@ static struct regulator_desc regulators[] = { .name = "DCDC2", .id = WM8400_DCDC2, .ops = &wm8400_dcdc_ops, + .n_voltages = WM8400_DC2_VSEL_MASK + 1, .type = REGULATOR_VOLTAGE, .owner = THIS_MODULE, }, @@ -294,7 +320,7 @@ static int __devinit wm8400_regulator_probe(struct platform_device *pdev) struct regulator_dev *rdev; rdev = regulator_register(®ulators[pdev->id], &pdev->dev, - pdev->dev.driver_data); + pdev->dev.platform_data, pdev->dev.driver_data); if (IS_ERR(rdev)) return PTR_ERR(rdev); diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 56002f7d26b..ffe34a12f44 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -688,22 +688,16 @@ config RTC_DRV_RS5C313 help If you say yes here you get support for the Ricoh RS5C313 RTC chips. -config RTC_DRV_PARISC - tristate "PA-RISC firmware RTC support" - depends on PARISC - help - Say Y or M here to enable RTC support on PA-RISC systems using - firmware calls. If you do not know what you are doing, you should +config RTC_DRV_GENERIC + tristate "Generic RTC support" + # Please consider writing a new RTC driver instead of using the generic + # RTC abstraction + depends on PARISC || M68K || PPC + help + Say Y or M here to enable RTC support on systems using the generic + RTC abstraction. If you do not know what you are doing, you should just say Y. -config RTC_DRV_PPC - tristate "PowerPC machine dependent RTC support" - depends on PPC - help - The PowerPC kernel has machine-specific functions for accessing - the RTC. This exposes that functionality through the generic RTC - class. - config RTC_DRV_PXA tristate "PXA27x/PXA3xx" depends on ARCH_PXA @@ -747,4 +741,13 @@ config RTC_DRV_MV This driver can also be built as a module. If so, the module will be called rtc-mv. +config RTC_DRV_PS3 + tristate "PS3 RTC" + depends on PPC_PS3 + help + If you say yes here you will get support for the RTC on PS3. + + This driver can also be built as a module. If so, the module + will be called rtc-ps3. + endif # RTC_CLASS diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index e7b09986d26..6c0639a14f0 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -56,8 +56,7 @@ obj-$(CONFIG_RTC_DRV_PCF8563) += rtc-pcf8563.o obj-$(CONFIG_RTC_DRV_PCF8583) += rtc-pcf8583.o obj-$(CONFIG_RTC_DRV_PL030) += rtc-pl030.o obj-$(CONFIG_RTC_DRV_PL031) += rtc-pl031.o -obj-$(CONFIG_RTC_DRV_PARISC) += rtc-parisc.o -obj-$(CONFIG_RTC_DRV_PPC) += rtc-ppc.o +obj-$(CONFIG_RTC_DRV_GENERIC) += rtc-generic.o obj-$(CONFIG_RTC_DRV_PXA) += rtc-pxa.o obj-$(CONFIG_RTC_DRV_R9701) += rtc-r9701.o obj-$(CONFIG_RTC_DRV_RS5C313) += rtc-rs5c313.o @@ -77,3 +76,4 @@ obj-$(CONFIG_RTC_DRV_VR41XX) += rtc-vr41xx.o obj-$(CONFIG_RTC_DRV_WM8350) += rtc-wm8350.o obj-$(CONFIG_RTC_DRV_X1205) += rtc-x1205.o obj-$(CONFIG_RTC_DRV_PCF50633) += rtc-pcf50633.o +obj-$(CONFIG_RTC_DRV_PS3) += rtc-ps3.o diff --git a/drivers/rtc/rtc-generic.c b/drivers/rtc/rtc-generic.c new file mode 100644 index 00000000000..98322004ad2 --- /dev/null +++ b/drivers/rtc/rtc-generic.c @@ -0,0 +1,84 @@ +/* rtc-generic: RTC driver using the generic RTC abstraction + * + * Copyright (C) 2008 Kyle McMartin <kyle@mcmartin.ca> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/time.h> +#include <linux/platform_device.h> +#include <linux/rtc.h> + +#include <asm/rtc.h> + +static int generic_get_time(struct device *dev, struct rtc_time *tm) +{ + unsigned int ret = get_rtc_time(tm); + + if (ret & RTC_BATT_BAD) + return -EOPNOTSUPP; + + return rtc_valid_tm(tm); +} + +static int generic_set_time(struct device *dev, struct rtc_time *tm) +{ + if (set_rtc_time(tm) < 0) + return -EOPNOTSUPP; + + return 0; +} + +static const struct rtc_class_ops generic_rtc_ops = { + .read_time = generic_get_time, + .set_time = generic_set_time, +}; + +static int __init generic_rtc_probe(struct platform_device *dev) +{ + struct rtc_device *rtc; + + rtc = rtc_device_register("rtc-generic", &dev->dev, &generic_rtc_ops, + THIS_MODULE); + if (IS_ERR(rtc)) + return PTR_ERR(rtc); + + platform_set_drvdata(dev, rtc); + + return 0; +} + +static int __exit generic_rtc_remove(struct platform_device *dev) +{ + struct rtc_device *rtc = platform_get_drvdata(dev); + + rtc_device_unregister(rtc); + + return 0; +} + +static struct platform_driver generic_rtc_driver = { + .driver = { + .name = "rtc-generic", + .owner = THIS_MODULE, + }, + .remove = __exit_p(generic_rtc_remove), +}; + +static int __init generic_rtc_init(void) +{ + return platform_driver_probe(&generic_rtc_driver, generic_rtc_probe); +} + +static void __exit generic_rtc_fini(void) +{ + platform_driver_unregister(&generic_rtc_driver); +} + +module_init(generic_rtc_init); +module_exit(generic_rtc_fini); + +MODULE_AUTHOR("Kyle McMartin <kyle@mcmartin.ca>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Generic RTC driver"); +MODULE_ALIAS("platform:rtc-generic"); diff --git a/drivers/rtc/rtc-parisc.c b/drivers/rtc/rtc-parisc.c deleted file mode 100644 index b966f56da97..00000000000 --- a/drivers/rtc/rtc-parisc.c +++ /dev/null @@ -1,86 +0,0 @@ -/* rtc-parisc: RTC for HP PA-RISC firmware - * - * Copyright (C) 2008 Kyle McMartin <kyle@mcmartin.ca> - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/time.h> -#include <linux/platform_device.h> -#include <linux/rtc.h> - -#include <asm/rtc.h> - -static int parisc_get_time(struct device *dev, struct rtc_time *tm) -{ - unsigned long ret; - - ret = get_rtc_time(tm); - - if (ret & RTC_BATT_BAD) - return -EOPNOTSUPP; - - return rtc_valid_tm(tm); -} - -static int parisc_set_time(struct device *dev, struct rtc_time *tm) -{ - if (set_rtc_time(tm) < 0) - return -EOPNOTSUPP; - - return 0; -} - -static const struct rtc_class_ops parisc_rtc_ops = { - .read_time = parisc_get_time, - .set_time = parisc_set_time, -}; - -static int __init parisc_rtc_probe(struct platform_device *dev) -{ - struct rtc_device *rtc; - - rtc = rtc_device_register("rtc-parisc", &dev->dev, &parisc_rtc_ops, - THIS_MODULE); - if (IS_ERR(rtc)) - return PTR_ERR(rtc); - - platform_set_drvdata(dev, rtc); - - return 0; -} - -static int __exit parisc_rtc_remove(struct platform_device *dev) -{ - struct rtc_device *rtc = platform_get_drvdata(dev); - - rtc_device_unregister(rtc); - - return 0; -} - -static struct platform_driver parisc_rtc_driver = { - .driver = { - .name = "rtc-parisc", - .owner = THIS_MODULE, - }, - .probe = parisc_rtc_probe, - .remove = __devexit_p(parisc_rtc_remove), -}; - -static int __init parisc_rtc_init(void) -{ - return platform_driver_probe(&parisc_rtc_driver, parisc_rtc_probe); -} - -static void __exit parisc_rtc_fini(void) -{ - platform_driver_unregister(&parisc_rtc_driver); -} - -module_init(parisc_rtc_init); -module_exit(parisc_rtc_fini); - -MODULE_AUTHOR("Kyle McMartin <kyle@mcmartin.ca>"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("HP PA-RISC RTC driver"); diff --git a/drivers/rtc/rtc-ppc.c b/drivers/rtc/rtc-ppc.c deleted file mode 100644 index c8e97e25ef7..00000000000 --- a/drivers/rtc/rtc-ppc.c +++ /dev/null @@ -1,69 +0,0 @@ -/* - * RTC driver for ppc_md RTC functions - * - * © 2007 Red Hat, Inc. - * - * Author: David Woodhouse <dwmw2@infradead.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - - -#include <linux/module.h> -#include <linux/err.h> -#include <linux/rtc.h> -#include <linux/platform_device.h> -#include <asm/machdep.h> - -static int ppc_rtc_read_time(struct device *dev, struct rtc_time *tm) -{ - ppc_md.get_rtc_time(tm); - return 0; -} - -static int ppc_rtc_set_time(struct device *dev, struct rtc_time *tm) -{ - return ppc_md.set_rtc_time(tm); -} - -static const struct rtc_class_ops ppc_rtc_ops = { - .set_time = ppc_rtc_set_time, - .read_time = ppc_rtc_read_time, -}; - -static struct rtc_device *rtc; -static struct platform_device *ppc_rtc_pdev; - -static int __init ppc_rtc_init(void) -{ - if (!ppc_md.get_rtc_time || !ppc_md.set_rtc_time) - return -ENODEV; - - ppc_rtc_pdev = platform_device_register_simple("ppc-rtc", 0, NULL, 0); - if (IS_ERR(ppc_rtc_pdev)) - return PTR_ERR(ppc_rtc_pdev); - - rtc = rtc_device_register("ppc_md", &ppc_rtc_pdev->dev, - &ppc_rtc_ops, THIS_MODULE); - if (IS_ERR(rtc)) { - platform_device_unregister(ppc_rtc_pdev); - return PTR_ERR(rtc); - } - - return 0; -} - -static void __exit ppc_rtc_exit(void) -{ - rtc_device_unregister(rtc); - platform_device_unregister(ppc_rtc_pdev); -} - -module_init(ppc_rtc_init); -module_exit(ppc_rtc_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org>"); -MODULE_DESCRIPTION("Generic RTC class driver for PowerPC"); diff --git a/drivers/rtc/rtc-ps3.c b/drivers/rtc/rtc-ps3.c new file mode 100644 index 00000000000..968133ce1ee --- /dev/null +++ b/drivers/rtc/rtc-ps3.c @@ -0,0 +1,104 @@ +/* + * PS3 RTC Driver + * + * Copyright 2009 Sony Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. + * If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/rtc.h> + +#include <asm/lv1call.h> +#include <asm/ps3.h> + + +static u64 read_rtc(void) +{ + int result; + u64 rtc_val; + u64 tb_val; + + result = lv1_get_rtc(&rtc_val, &tb_val); + BUG_ON(result); + + return rtc_val; +} + +static int ps3_get_time(struct device *dev, struct rtc_time *tm) +{ + rtc_time_to_tm(read_rtc() + ps3_os_area_get_rtc_diff(), tm); + return rtc_valid_tm(tm); +} + +static int ps3_set_time(struct device *dev, struct rtc_time *tm) +{ + unsigned long now; + + rtc_tm_to_time(tm, &now); + ps3_os_area_set_rtc_diff(now - read_rtc()); + return 0; +} + +static const struct rtc_class_ops ps3_rtc_ops = { + .read_time = ps3_get_time, + .set_time = ps3_set_time, +}; + +static int __init ps3_rtc_probe(struct platform_device *dev) +{ + struct rtc_device *rtc; + + rtc = rtc_device_register("rtc-ps3", &dev->dev, &ps3_rtc_ops, + THIS_MODULE); + if (IS_ERR(rtc)) + return PTR_ERR(rtc); + + platform_set_drvdata(dev, rtc); + return 0; +} + +static int __exit ps3_rtc_remove(struct platform_device *dev) +{ + rtc_device_unregister(platform_get_drvdata(dev)); + return 0; +} + +static struct platform_driver ps3_rtc_driver = { + .driver = { + .name = "rtc-ps3", + .owner = THIS_MODULE, + }, + .remove = __exit_p(ps3_rtc_remove), +}; + +static int __init ps3_rtc_init(void) +{ + return platform_driver_probe(&ps3_rtc_driver, ps3_rtc_probe); +} + +static void __exit ps3_rtc_fini(void) +{ + platform_driver_unregister(&ps3_rtc_driver); +} + +module_init(ps3_rtc_init); +module_exit(ps3_rtc_fini); + +MODULE_AUTHOR("Sony Corporation"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ps3 RTC driver"); +MODULE_ALIAS("platform:rtc-ps3"); diff --git a/drivers/serial/mcf.c b/drivers/serial/mcf.c index 56841fe5f48..0eefb07beba 100644 --- a/drivers/serial/mcf.c +++ b/drivers/serial/mcf.c @@ -513,7 +513,7 @@ static int __init mcf_console_setup(struct console *co, char *options) int parity = 'n'; int flow = 'n'; - if ((co->index >= 0) && (co->index <= MCF_MAXPORTS)) + if ((co->index < 0) || (co->index >= MCF_MAXPORTS)) co->index = 0; port = &mcf_ports[co->index].port; if (port->membase == 0) diff --git a/drivers/usb/storage/isd200.c b/drivers/usb/storage/isd200.c index 882c57b399f..fdba2f69d4c 100644 --- a/drivers/usb/storage/isd200.c +++ b/drivers/usb/storage/isd200.c @@ -46,6 +46,7 @@ #include <linux/errno.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/ata.h> #include <linux/hdreg.h> #include <linux/scatterlist.h> @@ -328,7 +329,7 @@ struct isd200_config { struct isd200_info { struct inquiry_data InquiryData; - struct hd_driveid *id; + u16 *id; struct isd200_config ConfigData; unsigned char *RegsBuf; unsigned char ATARegs[8]; @@ -419,19 +420,19 @@ static void isd200_build_sense(struct us_data *us, struct scsi_cmnd *srb) buf->Flags = UNIT_ATTENTION; buf->AdditionalSenseCode = 0; buf->AdditionalSenseCodeQualifier = 0; - } else if(error & MCR_ERR) { + } else if (error & ATA_MCR) { buf->ErrorCode = 0x70 | SENSE_ERRCODE_VALID; buf->AdditionalSenseLength = 0xb; buf->Flags = UNIT_ATTENTION; buf->AdditionalSenseCode = 0; buf->AdditionalSenseCodeQualifier = 0; - } else if(error & TRK0_ERR) { + } else if (error & ATA_TRK0NF) { buf->ErrorCode = 0x70 | SENSE_ERRCODE_VALID; buf->AdditionalSenseLength = 0xb; buf->Flags = NOT_READY; buf->AdditionalSenseCode = 0; buf->AdditionalSenseCodeQualifier = 0; - } else if(error & ECC_ERR) { + } else if (error & ATA_UNC) { buf->ErrorCode = 0x70 | SENSE_ERRCODE_VALID; buf->AdditionalSenseLength = 0xb; buf->Flags = DATA_PROTECT; @@ -547,16 +548,16 @@ static int isd200_action( struct us_data *us, int action, ata.generic.ActionSelect = ACTION_SELECT_1|ACTION_SELECT_5; ata.generic.RegisterSelect = REG_DEVICE_HEAD | REG_COMMAND; ata.write.DeviceHeadByte = info->DeviceHead; - ata.write.CommandByte = WIN_SRST; + ata.write.CommandByte = ATA_CMD_DEV_RESET; isd200_set_srb(info, DMA_NONE, NULL, 0); break; case ACTION_IDENTIFY: US_DEBUGP(" isd200_action(IDENTIFY)\n"); ata.generic.RegisterSelect = REG_COMMAND; - ata.write.CommandByte = WIN_IDENTIFY; + ata.write.CommandByte = ATA_CMD_ID_ATA; isd200_set_srb(info, DMA_FROM_DEVICE, info->id, - sizeof(struct hd_driveid)); + ATA_ID_WORDS * 2); break; default: @@ -944,22 +945,22 @@ static int isd200_try_enum(struct us_data *us, unsigned char master_slave, break; if (!detect) { - if (regs[ATA_REG_STATUS_OFFSET] & BUSY_STAT) { + if (regs[ATA_REG_STATUS_OFFSET] & ATA_BUSY) { US_DEBUGP(" %s status is still BSY, try again...\n",mstr); } else { US_DEBUGP(" %s status !BSY, continue with next operation\n",mstr); break; } } - /* check for BUSY_STAT and */ - /* WRERR_STAT (workaround ATA Zip drive) and */ - /* ERR_STAT (workaround for Archos CD-ROM) */ + /* check for ATA_BUSY and */ + /* ATA_DF (workaround ATA Zip drive) and */ + /* ATA_ERR (workaround for Archos CD-ROM) */ else if (regs[ATA_REG_STATUS_OFFSET] & - (BUSY_STAT | WRERR_STAT | ERR_STAT )) { + (ATA_BUSY | ATA_DF | ATA_ERR)) { US_DEBUGP(" Status indicates it is not ready, try again...\n"); } /* check for DRDY, ATA devices set DRDY after SRST */ - else if (regs[ATA_REG_STATUS_OFFSET] & READY_STAT) { + else if (regs[ATA_REG_STATUS_OFFSET] & ATA_DRDY) { US_DEBUGP(" Identified ATA device\n"); info->DeviceFlags |= DF_ATA_DEVICE; info->DeviceHead = master_slave; @@ -1053,103 +1054,50 @@ static int isd200_manual_enum(struct us_data *us) return(retStatus); } -static void isd200_fix_driveid (struct hd_driveid *id) +static void isd200_fix_driveid(u16 *id) { #ifndef __LITTLE_ENDIAN # ifdef __BIG_ENDIAN int i; - u16 *stringcast; - - id->config = __le16_to_cpu(id->config); - id->cyls = __le16_to_cpu(id->cyls); - id->reserved2 = __le16_to_cpu(id->reserved2); - id->heads = __le16_to_cpu(id->heads); - id->track_bytes = __le16_to_cpu(id->track_bytes); - id->sector_bytes = __le16_to_cpu(id->sector_bytes); - id->sectors = __le16_to_cpu(id->sectors); - id->vendor0 = __le16_to_cpu(id->vendor0); - id->vendor1 = __le16_to_cpu(id->vendor1); - id->vendor2 = __le16_to_cpu(id->vendor2); - stringcast = (u16 *)&id->serial_no[0]; - for (i = 0; i < (20/2); i++) - stringcast[i] = __le16_to_cpu(stringcast[i]); - id->buf_type = __le16_to_cpu(id->buf_type); - id->buf_size = __le16_to_cpu(id->buf_size); - id->ecc_bytes = __le16_to_cpu(id->ecc_bytes); - stringcast = (u16 *)&id->fw_rev[0]; - for (i = 0; i < (8/2); i++) - stringcast[i] = __le16_to_cpu(stringcast[i]); - stringcast = (u16 *)&id->model[0]; - for (i = 0; i < (40/2); i++) - stringcast[i] = __le16_to_cpu(stringcast[i]); - id->dword_io = __le16_to_cpu(id->dword_io); - id->reserved50 = __le16_to_cpu(id->reserved50); - id->field_valid = __le16_to_cpu(id->field_valid); - id->cur_cyls = __le16_to_cpu(id->cur_cyls); - id->cur_heads = __le16_to_cpu(id->cur_heads); - id->cur_sectors = __le16_to_cpu(id->cur_sectors); - id->cur_capacity0 = __le16_to_cpu(id->cur_capacity0); - id->cur_capacity1 = __le16_to_cpu(id->cur_capacity1); - id->lba_capacity = __le32_to_cpu(id->lba_capacity); - id->dma_1word = __le16_to_cpu(id->dma_1word); - id->dma_mword = __le16_to_cpu(id->dma_mword); - id->eide_pio_modes = __le16_to_cpu(id->eide_pio_modes); - id->eide_dma_min = __le16_to_cpu(id->eide_dma_min); - id->eide_dma_time = __le16_to_cpu(id->eide_dma_time); - id->eide_pio = __le16_to_cpu(id->eide_pio); - id->eide_pio_iordy = __le16_to_cpu(id->eide_pio_iordy); - for (i = 0; i < 2; ++i) - id->words69_70[i] = __le16_to_cpu(id->words69_70[i]); - for (i = 0; i < 4; ++i) - id->words71_74[i] = __le16_to_cpu(id->words71_74[i]); - id->queue_depth = __le16_to_cpu(id->queue_depth); - for (i = 0; i < 4; ++i) - id->words76_79[i] = __le16_to_cpu(id->words76_79[i]); - id->major_rev_num = __le16_to_cpu(id->major_rev_num); - id->minor_rev_num = __le16_to_cpu(id->minor_rev_num); - id->command_set_1 = __le16_to_cpu(id->command_set_1); - id->command_set_2 = __le16_to_cpu(id->command_set_2); - id->cfsse = __le16_to_cpu(id->cfsse); - id->cfs_enable_1 = __le16_to_cpu(id->cfs_enable_1); - id->cfs_enable_2 = __le16_to_cpu(id->cfs_enable_2); - id->csf_default = __le16_to_cpu(id->csf_default); - id->dma_ultra = __le16_to_cpu(id->dma_ultra); - id->trseuc = __le16_to_cpu(id->trseuc); - id->trsEuc = __le16_to_cpu(id->trsEuc); - id->CurAPMvalues = __le16_to_cpu(id->CurAPMvalues); - id->mprc = __le16_to_cpu(id->mprc); - id->hw_config = __le16_to_cpu(id->hw_config); - id->acoustic = __le16_to_cpu(id->acoustic); - id->msrqs = __le16_to_cpu(id->msrqs); - id->sxfert = __le16_to_cpu(id->sxfert); - id->sal = __le16_to_cpu(id->sal); - id->spg = __le32_to_cpu(id->spg); - id->lba_capacity_2 = __le64_to_cpu(id->lba_capacity_2); - for (i = 0; i < 22; i++) - id->words104_125[i] = __le16_to_cpu(id->words104_125[i]); - id->last_lun = __le16_to_cpu(id->last_lun); - id->word127 = __le16_to_cpu(id->word127); - id->dlf = __le16_to_cpu(id->dlf); - id->csfo = __le16_to_cpu(id->csfo); - for (i = 0; i < 26; i++) - id->words130_155[i] = __le16_to_cpu(id->words130_155[i]); - id->word156 = __le16_to_cpu(id->word156); - for (i = 0; i < 3; i++) - id->words157_159[i] = __le16_to_cpu(id->words157_159[i]); - id->cfa_power = __le16_to_cpu(id->cfa_power); - for (i = 0; i < 14; i++) - id->words161_175[i] = __le16_to_cpu(id->words161_175[i]); - for (i = 0; i < 31; i++) - id->words176_205[i] = __le16_to_cpu(id->words176_205[i]); - for (i = 0; i < 48; i++) - id->words206_254[i] = __le16_to_cpu(id->words206_254[i]); - id->integrity_word = __le16_to_cpu(id->integrity_word); + + for (i = 0; i < ATA_ID_WORDS; i++) + id[i] = __le16_to_cpu(id[i]); # else # error "Please fix <asm/byteorder.h>" # endif #endif } +static void isd200_dump_driveid(u16 *id) +{ + US_DEBUGP(" Identify Data Structure:\n"); + US_DEBUGP(" config = 0x%x\n", id[ATA_ID_CONFIG]); + US_DEBUGP(" cyls = 0x%x\n", id[ATA_ID_CYLS]); + US_DEBUGP(" heads = 0x%x\n", id[ATA_ID_HEADS]); + US_DEBUGP(" track_bytes = 0x%x\n", id[4]); + US_DEBUGP(" sector_bytes = 0x%x\n", id[5]); + US_DEBUGP(" sectors = 0x%x\n", id[ATA_ID_SECTORS]); + US_DEBUGP(" serial_no[0] = 0x%x\n", *(char *)&id[ATA_ID_SERNO]); + US_DEBUGP(" buf_type = 0x%x\n", id[20]); + US_DEBUGP(" buf_size = 0x%x\n", id[ATA_ID_BUF_SIZE]); + US_DEBUGP(" ecc_bytes = 0x%x\n", id[22]); + US_DEBUGP(" fw_rev[0] = 0x%x\n", *(char *)&id[ATA_ID_FW_REV]); + US_DEBUGP(" model[0] = 0x%x\n", *(char *)&id[ATA_ID_PROD]); + US_DEBUGP(" max_multsect = 0x%x\n", id[ATA_ID_MAX_MULTSECT] & 0xff); + US_DEBUGP(" dword_io = 0x%x\n", id[ATA_ID_DWORD_IO]); + US_DEBUGP(" capability = 0x%x\n", id[ATA_ID_CAPABILITY] >> 8); + US_DEBUGP(" tPIO = 0x%x\n", id[ATA_ID_OLD_PIO_MODES] >> 8); + US_DEBUGP(" tDMA = 0x%x\n", id[ATA_ID_OLD_DMA_MODES] >> 8); + US_DEBUGP(" field_valid = 0x%x\n", id[ATA_ID_FIELD_VALID]); + US_DEBUGP(" cur_cyls = 0x%x\n", id[ATA_ID_CUR_CYLS]); + US_DEBUGP(" cur_heads = 0x%x\n", id[ATA_ID_CUR_HEADS]); + US_DEBUGP(" cur_sectors = 0x%x\n", id[ATA_ID_CUR_SECTORS]); + US_DEBUGP(" cur_capacity = 0x%x\n", ata_id_u32(id, 57)); + US_DEBUGP(" multsect = 0x%x\n", id[ATA_ID_MULTSECT] & 0xff); + US_DEBUGP(" lba_capacity = 0x%x\n", ata_id_u32(id, ATA_ID_LBA_CAPACITY)); + US_DEBUGP(" command_set_1 = 0x%x\n", id[ATA_ID_COMMAND_SET_1]); + US_DEBUGP(" command_set_2 = 0x%x\n", id[ATA_ID_COMMAND_SET_2]); +} /************************************************************************** * isd200_get_inquiry_data @@ -1163,7 +1111,7 @@ static int isd200_get_inquiry_data( struct us_data *us ) { struct isd200_info *info = (struct isd200_info *)us->extra; int retStatus = ISD200_GOOD; - struct hd_driveid *id = info->id; + u16 *id = info->id; US_DEBUGP("Entering isd200_get_inquiry_data\n"); @@ -1180,8 +1128,7 @@ static int isd200_get_inquiry_data( struct us_data *us ) /* this must be an ATA device */ /* perform an ATA Command Identify */ transferStatus = isd200_action( us, ACTION_IDENTIFY, - id, - sizeof(struct hd_driveid) ); + id, ATA_ID_WORDS * 2); if (transferStatus != ISD200_TRANSPORT_GOOD) { /* Error issuing ATA Command Identify */ US_DEBUGP(" Error issuing ATA Command Identify\n"); @@ -1191,35 +1138,9 @@ static int isd200_get_inquiry_data( struct us_data *us ) int i; __be16 *src; __u16 *dest; - isd200_fix_driveid(id); - US_DEBUGP(" Identify Data Structure:\n"); - US_DEBUGP(" config = 0x%x\n", id->config); - US_DEBUGP(" cyls = 0x%x\n", id->cyls); - US_DEBUGP(" heads = 0x%x\n", id->heads); - US_DEBUGP(" track_bytes = 0x%x\n", id->track_bytes); - US_DEBUGP(" sector_bytes = 0x%x\n", id->sector_bytes); - US_DEBUGP(" sectors = 0x%x\n", id->sectors); - US_DEBUGP(" serial_no[0] = 0x%x\n", id->serial_no[0]); - US_DEBUGP(" buf_type = 0x%x\n", id->buf_type); - US_DEBUGP(" buf_size = 0x%x\n", id->buf_size); - US_DEBUGP(" ecc_bytes = 0x%x\n", id->ecc_bytes); - US_DEBUGP(" fw_rev[0] = 0x%x\n", id->fw_rev[0]); - US_DEBUGP(" model[0] = 0x%x\n", id->model[0]); - US_DEBUGP(" max_multsect = 0x%x\n", id->max_multsect); - US_DEBUGP(" dword_io = 0x%x\n", id->dword_io); - US_DEBUGP(" capability = 0x%x\n", id->capability); - US_DEBUGP(" tPIO = 0x%x\n", id->tPIO); - US_DEBUGP(" tDMA = 0x%x\n", id->tDMA); - US_DEBUGP(" field_valid = 0x%x\n", id->field_valid); - US_DEBUGP(" cur_cyls = 0x%x\n", id->cur_cyls); - US_DEBUGP(" cur_heads = 0x%x\n", id->cur_heads); - US_DEBUGP(" cur_sectors = 0x%x\n", id->cur_sectors); - US_DEBUGP(" cur_capacity = 0x%x\n", (id->cur_capacity1 << 16) + id->cur_capacity0 ); - US_DEBUGP(" multsect = 0x%x\n", id->multsect); - US_DEBUGP(" lba_capacity = 0x%x\n", id->lba_capacity); - US_DEBUGP(" command_set_1 = 0x%x\n", id->command_set_1); - US_DEBUGP(" command_set_2 = 0x%x\n", id->command_set_2); + isd200_fix_driveid(id); + isd200_dump_driveid(id); memset(&info->InquiryData, 0, sizeof(info->InquiryData)); @@ -1229,30 +1150,30 @@ static int isd200_get_inquiry_data( struct us_data *us ) /* The length must be at least 36 (5 + 31) */ info->InquiryData.AdditionalLength = 0x1F; - if (id->command_set_1 & COMMANDSET_MEDIA_STATUS) { + if (id[ATA_ID_COMMAND_SET_1] & COMMANDSET_MEDIA_STATUS) { /* set the removable bit */ info->InquiryData.DeviceTypeModifier = DEVICE_REMOVABLE; info->DeviceFlags |= DF_REMOVABLE_MEDIA; } /* Fill in vendor identification fields */ - src = (__be16*)id->model; + src = (__be16 *)&id[ATA_ID_PROD]; dest = (__u16*)info->InquiryData.VendorId; for (i=0;i<4;i++) dest[i] = be16_to_cpu(src[i]); - src = (__be16*)(id->model+8); + src = (__be16 *)&id[ATA_ID_PROD + 8/2]; dest = (__u16*)info->InquiryData.ProductId; for (i=0;i<8;i++) dest[i] = be16_to_cpu(src[i]); - src = (__be16*)id->fw_rev; + src = (__be16 *)&id[ATA_ID_FW_REV]; dest = (__u16*)info->InquiryData.ProductRevisionLevel; for (i=0;i<2;i++) dest[i] = be16_to_cpu(src[i]); /* determine if it supports Media Status Notification */ - if (id->command_set_2 & COMMANDSET_MEDIA_STATUS) { + if (id[ATA_ID_COMMAND_SET_2] & COMMANDSET_MEDIA_STATUS) { US_DEBUGP(" Device supports Media Status Notification\n"); /* Indicate that it is enabled, even though it is not @@ -1301,7 +1222,7 @@ static int isd200_scsi_to_ata(struct scsi_cmnd *srb, struct us_data *us, union ata_cdb * ataCdb) { struct isd200_info *info = (struct isd200_info *)us->extra; - struct hd_driveid *id = info->id; + u16 *id = info->id; int sendToTransport = 1; unsigned char sectnum, head; unsigned short cylinder; @@ -1369,13 +1290,12 @@ static int isd200_scsi_to_ata(struct scsi_cmnd *srb, struct us_data *us, US_DEBUGP(" ATA OUT - SCSIOP_READ_CAPACITY\n"); - if (id->capability & CAPABILITY_LBA ) { - capacity = id->lba_capacity - 1; - } else { - capacity = (id->heads * - id->cyls * - id->sectors) - 1; - } + if (ata_id_has_lba(id)) + capacity = ata_id_u32(id, ATA_ID_LBA_CAPACITY) - 1; + else + capacity = (id[ATA_ID_HEADS] * id[ATA_ID_CYLS] * + id[ATA_ID_SECTORS]) - 1; + readCapacityData.LogicalBlockAddress = cpu_to_be32(capacity); readCapacityData.BytesPerBlock = cpu_to_be32(0x200); @@ -1392,16 +1312,16 @@ static int isd200_scsi_to_ata(struct scsi_cmnd *srb, struct us_data *us, lba = be32_to_cpu(*(__be32 *)&srb->cmnd[2]); blockCount = (unsigned long)srb->cmnd[7]<<8 | (unsigned long)srb->cmnd[8]; - if (id->capability & CAPABILITY_LBA) { + if (ata_id_has_lba(id)) { sectnum = (unsigned char)(lba); cylinder = (unsigned short)(lba>>8); head = ATA_ADDRESS_DEVHEAD_LBA_MODE | (unsigned char)(lba>>24 & 0x0F); } else { - sectnum = (unsigned char)((lba % id->sectors) + 1); - cylinder = (unsigned short)(lba / (id->sectors * - id->heads)); - head = (unsigned char)((lba / id->sectors) % - id->heads); + sectnum = (u8)((lba % id[ATA_ID_SECTORS]) + 1); + cylinder = (u16)(lba / (id[ATA_ID_SECTORS] * + id[ATA_ID_HEADS])); + head = (u8)((lba / id[ATA_ID_SECTORS]) % + id[ATA_ID_HEADS]); } ataCdb->generic.SignatureByte0 = info->ConfigData.ATAMajorCommand; ataCdb->generic.SignatureByte1 = info->ConfigData.ATAMinorCommand; @@ -1415,7 +1335,7 @@ static int isd200_scsi_to_ata(struct scsi_cmnd *srb, struct us_data *us, ataCdb->write.CylinderHighByte = (unsigned char)(cylinder>>8); ataCdb->write.CylinderLowByte = (unsigned char)cylinder; ataCdb->write.DeviceHeadByte = (head | ATA_ADDRESS_DEVHEAD_STD); - ataCdb->write.CommandByte = WIN_READ; + ataCdb->write.CommandByte = ATA_CMD_PIO_READ; break; case WRITE_10: @@ -1424,14 +1344,16 @@ static int isd200_scsi_to_ata(struct scsi_cmnd *srb, struct us_data *us, lba = be32_to_cpu(*(__be32 *)&srb->cmnd[2]); blockCount = (unsigned long)srb->cmnd[7]<<8 | (unsigned long)srb->cmnd[8]; - if (id->capability & CAPABILITY_LBA) { + if (ata_id_has_lba(id)) { sectnum = (unsigned char)(lba); cylinder = (unsigned short)(lba>>8); head = ATA_ADDRESS_DEVHEAD_LBA_MODE | (unsigned char)(lba>>24 & 0x0F); } else { - sectnum = (unsigned char)((lba % id->sectors) + 1); - cylinder = (unsigned short)(lba / (id->sectors * id->heads)); - head = (unsigned char)((lba / id->sectors) % id->heads); + sectnum = (u8)((lba % id[ATA_ID_SECTORS]) + 1); + cylinder = (u16)(lba / (id[ATA_ID_SECTORS] * + id[ATA_ID_HEADS])); + head = (u8)((lba / id[ATA_ID_SECTORS]) % + id[ATA_ID_HEADS]); } ataCdb->generic.SignatureByte0 = info->ConfigData.ATAMajorCommand; ataCdb->generic.SignatureByte1 = info->ConfigData.ATAMinorCommand; @@ -1445,7 +1367,7 @@ static int isd200_scsi_to_ata(struct scsi_cmnd *srb, struct us_data *us, ataCdb->write.CylinderHighByte = (unsigned char)(cylinder>>8); ataCdb->write.CylinderLowByte = (unsigned char)cylinder; ataCdb->write.DeviceHeadByte = (head | ATA_ADDRESS_DEVHEAD_STD); - ataCdb->write.CommandByte = WIN_WRITE; + ataCdb->write.CommandByte = ATA_CMD_PIO_WRITE; break; case ALLOW_MEDIUM_REMOVAL: @@ -1459,7 +1381,7 @@ static int isd200_scsi_to_ata(struct scsi_cmnd *srb, struct us_data *us, ataCdb->generic.TransferBlockSize = 1; ataCdb->generic.RegisterSelect = REG_COMMAND; ataCdb->write.CommandByte = (srb->cmnd[4] & 0x1) ? - WIN_DOORLOCK : WIN_DOORUNLOCK; + ATA_CMD_MEDIA_LOCK : ATA_CMD_MEDIA_UNLOCK; isd200_srb_set_bufflen(srb, 0); } else { US_DEBUGP(" Not removeable media, just report okay\n"); @@ -1539,8 +1461,7 @@ static int isd200_init_info(struct us_data *us) if (!info) retStatus = ISD200_ERROR; else { - info->id = (struct hd_driveid *) - kzalloc(sizeof(struct hd_driveid), GFP_KERNEL); + info->id = kzalloc(ATA_ID_WORDS * 2, GFP_KERNEL); info->RegsBuf = (unsigned char *) kmalloc(sizeof(info->ATARegs), GFP_KERNEL); info->srb.sense_buffer = |