diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Makefile | 6 | ||||
-rw-r--r-- | drivers/md/dm-emc.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-exception-store.c | 10 | ||||
-rw-r--r-- | drivers/md/dm-io.c | 38 | ||||
-rw-r--r-- | drivers/md/dm-io.h | 79 | ||||
-rw-r--r-- | drivers/md/dm-kcopyd.c (renamed from drivers/md/kcopyd.c) | 298 | ||||
-rw-r--r-- | drivers/md/dm-log.c | 254 | ||||
-rw-r--r-- | drivers/md/dm-log.h | 131 | ||||
-rw-r--r-- | drivers/md/dm-mpath-hp-sw.c | 1 | ||||
-rw-r--r-- | drivers/md/dm-mpath-rdac.c | 1 | ||||
-rw-r--r-- | drivers/md/dm-raid1.c | 132 | ||||
-rw-r--r-- | drivers/md/dm-snap.c | 22 | ||||
-rw-r--r-- | drivers/md/dm-snap.h | 4 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 47 | ||||
-rw-r--r-- | drivers/md/dm-uevent.c | 22 | ||||
-rw-r--r-- | drivers/md/dm.c | 16 | ||||
-rw-r--r-- | drivers/md/dm.h | 98 | ||||
-rw-r--r-- | drivers/md/kcopyd.h | 42 | ||||
-rw-r--r-- | drivers/md/md.c | 129 | ||||
-rw-r--r-- | drivers/md/multipath.c | 3 | ||||
-rw-r--r-- | drivers/md/raid1.c | 31 | ||||
-rw-r--r-- | drivers/md/raid10.c | 33 | ||||
-rw-r--r-- | drivers/md/raid5.c | 191 | ||||
-rw-r--r-- | drivers/md/raid6algos.c | 3 |
24 files changed, 748 insertions, 845 deletions
diff --git a/drivers/md/Makefile b/drivers/md/Makefile index d9aa7edb878..7be09eeea29 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -3,10 +3,10 @@ # dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ - dm-ioctl.o dm-io.o kcopyd.o + dm-ioctl.o dm-io.o dm-kcopyd.o dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o dm-snapshot-objs := dm-snap.o dm-exception-store.o -dm-mirror-objs := dm-log.o dm-raid1.o +dm-mirror-objs := dm-raid1.o dm-rdac-objs := dm-mpath-rdac.o dm-hp-sw-objs := dm-mpath-hp-sw.o md-mod-objs := md.o bitmap.o @@ -39,7 +39,7 @@ obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc.o obj-$(CONFIG_DM_MULTIPATH_HP) += dm-hp-sw.o obj-$(CONFIG_DM_MULTIPATH_RDAC) += dm-rdac.o obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o -obj-$(CONFIG_DM_MIRROR) += dm-mirror.o +obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o obj-$(CONFIG_DM_ZERO) += dm-zero.o quiet_cmd_unroll = UNROLL $@ diff --git a/drivers/md/dm-emc.c b/drivers/md/dm-emc.c index 6b91b9ab1d4..3ea5ad4b780 100644 --- a/drivers/md/dm-emc.c +++ b/drivers/md/dm-emc.c @@ -110,8 +110,6 @@ static struct request *get_failover_req(struct emc_handler *h, memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE); rq->sense_len = 0; - memset(&rq->cmd, 0, BLK_MAX_CDB); - rq->timeout = EMC_FAILOVER_TIMEOUT; rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->cmd_flags |= REQ_FAILFAST | REQ_NOMERGE; diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index 5bbce29f143..41f408068a7 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -9,13 +9,13 @@ #include "dm.h" #include "dm-snap.h" -#include "dm-io.h" -#include "kcopyd.h" #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/vmalloc.h> #include <linux/slab.h> +#include <linux/dm-io.h> +#include <linux/dm-kcopyd.h> #define DM_MSG_PREFIX "snapshots" #define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */ @@ -131,7 +131,7 @@ struct pstore { static unsigned sectors_to_pages(unsigned sectors) { - return sectors / (PAGE_SIZE >> 9); + return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9); } static int alloc_area(struct pstore *ps) @@ -159,7 +159,7 @@ static void free_area(struct pstore *ps) } struct mdata_req { - struct io_region *where; + struct dm_io_region *where; struct dm_io_request *io_req; struct work_struct work; int result; @@ -177,7 +177,7 @@ static void do_metadata(struct work_struct *work) */ static int chunk_io(struct pstore *ps, uint32_t chunk, int rw, int metadata) { - struct io_region where = { + struct dm_io_region where = { .bdev = ps->snap->cow->bdev, .sector = ps->snap->chunk_size * chunk, .count = ps->snap->chunk_size, diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 8f25f628ef1..4789c42d9a3 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -5,13 +5,14 @@ * This file is released under the GPL. */ -#include "dm-io.h" +#include "dm.h" #include <linux/bio.h> #include <linux/mempool.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/dm-io.h> struct dm_io_client { mempool_t *pool; @@ -20,7 +21,7 @@ struct dm_io_client { /* FIXME: can we shrink this ? */ struct io { - unsigned long error; + unsigned long error_bits; atomic_t count; struct task_struct *sleeper; struct dm_io_client *client; @@ -107,14 +108,14 @@ static inline unsigned bio_get_region(struct bio *bio) static void dec_count(struct io *io, unsigned int region, int error) { if (error) - set_bit(region, &io->error); + set_bit(region, &io->error_bits); if (atomic_dec_and_test(&io->count)) { if (io->sleeper) wake_up_process(io->sleeper); else { - unsigned long r = io->error; + unsigned long r = io->error_bits; io_notify_fn fn = io->callback; void *context = io->context; @@ -271,7 +272,7 @@ static void km_dp_init(struct dpages *dp, void *data) /*----------------------------------------------------------------- * IO routines that accept a list of pages. *---------------------------------------------------------------*/ -static void do_region(int rw, unsigned int region, struct io_region *where, +static void do_region(int rw, unsigned region, struct dm_io_region *where, struct dpages *dp, struct io *io) { struct bio *bio; @@ -320,7 +321,7 @@ static void do_region(int rw, unsigned int region, struct io_region *where, } static void dispatch_io(int rw, unsigned int num_regions, - struct io_region *where, struct dpages *dp, + struct dm_io_region *where, struct dpages *dp, struct io *io, int sync) { int i; @@ -347,17 +348,17 @@ static void dispatch_io(int rw, unsigned int num_regions, } static int sync_io(struct dm_io_client *client, unsigned int num_regions, - struct io_region *where, int rw, struct dpages *dp, + struct dm_io_region *where, int rw, struct dpages *dp, unsigned long *error_bits) { struct io io; - if (num_regions > 1 && rw != WRITE) { + if (num_regions > 1 && (rw & RW_MASK) != WRITE) { WARN_ON(1); return -EIO; } - io.error = 0; + io.error_bits = 0; atomic_set(&io.count, 1); /* see dispatch_io() */ io.sleeper = current; io.client = client; @@ -378,25 +379,25 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, return -EINTR; if (error_bits) - *error_bits = io.error; + *error_bits = io.error_bits; - return io.error ? -EIO : 0; + return io.error_bits ? -EIO : 0; } static int async_io(struct dm_io_client *client, unsigned int num_regions, - struct io_region *where, int rw, struct dpages *dp, + struct dm_io_region *where, int rw, struct dpages *dp, io_notify_fn fn, void *context) { struct io *io; - if (num_regions > 1 && rw != WRITE) { + if (num_regions > 1 && (rw & RW_MASK) != WRITE) { WARN_ON(1); fn(1, context); return -EIO; } io = mempool_alloc(client->pool, GFP_NOIO); - io->error = 0; + io->error_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ io->sleeper = NULL; io->client = client; @@ -435,10 +436,15 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp) } /* - * New collapsed (a)synchronous interface + * New collapsed (a)synchronous interface. + * + * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug + * the queue with blk_unplug() some time later or set the BIO_RW_SYNC bit in + * io_req->bi_rw. If you fail to do one of these, the IO will be submitted to + * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c. */ int dm_io(struct dm_io_request *io_req, unsigned num_regions, - struct io_region *where, unsigned long *sync_error_bits) + struct dm_io_region *where, unsigned long *sync_error_bits) { int r; struct dpages dp; diff --git a/drivers/md/dm-io.h b/drivers/md/dm-io.h deleted file mode 100644 index f647e2cceaa..00000000000 --- a/drivers/md/dm-io.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (C) 2003 Sistina Software - * - * This file is released under the GPL. - */ - -#ifndef _DM_IO_H -#define _DM_IO_H - -#include "dm.h" - -struct io_region { - struct block_device *bdev; - sector_t sector; - sector_t count; /* If this is zero the region is ignored. */ -}; - -struct page_list { - struct page_list *next; - struct page *page; -}; - -typedef void (*io_notify_fn)(unsigned long error, void *context); - -enum dm_io_mem_type { - DM_IO_PAGE_LIST,/* Page list */ - DM_IO_BVEC, /* Bio vector */ - DM_IO_VMA, /* Virtual memory area */ - DM_IO_KMEM, /* Kernel memory */ -}; - -struct dm_io_memory { - enum dm_io_mem_type type; - - union { - struct page_list *pl; - struct bio_vec *bvec; - void *vma; - void *addr; - } ptr; - - unsigned offset; -}; - -struct dm_io_notify { - io_notify_fn fn; /* Callback for asynchronous requests */ - void *context; /* Passed to callback */ -}; - -/* - * IO request structure - */ -struct dm_io_client; -struct dm_io_request { - int bi_rw; /* READ|WRITE - not READA */ - struct dm_io_memory mem; /* Memory to use for io */ - struct dm_io_notify notify; /* Synchronous if notify.fn is NULL */ - struct dm_io_client *client; /* Client memory handler */ -}; - -/* - * For async io calls, users can alternatively use the dm_io() function below - * and dm_io_client_create() to create private mempools for the client. - * - * Create/destroy may block. - */ -struct dm_io_client *dm_io_client_create(unsigned num_pages); -int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client); -void dm_io_client_destroy(struct dm_io_client *client); - -/* - * IO interface using private per-client pools. - * Each bit in the optional 'sync_error_bits' bitset indicates whether an - * error occurred doing io to the corresponding region. - */ -int dm_io(struct dm_io_request *io_req, unsigned num_regions, - struct io_region *region, unsigned long *sync_error_bits); - -#endif diff --git a/drivers/md/kcopyd.c b/drivers/md/dm-kcopyd.c index e76b52ade69..996802b8a45 100644 --- a/drivers/md/kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -9,9 +9,8 @@ * completion notification. */ -#include <asm/types.h> +#include <linux/types.h> #include <asm/atomic.h> - #include <linux/blkdev.h> #include <linux/fs.h> #include <linux/init.h> @@ -23,24 +22,15 @@ #include <linux/vmalloc.h> #include <linux/workqueue.h> #include <linux/mutex.h> +#include <linux/dm-kcopyd.h> -#include "kcopyd.h" - -static struct workqueue_struct *_kcopyd_wq; -static struct work_struct _kcopyd_work; - -static void wake(void) -{ - queue_work(_kcopyd_wq, &_kcopyd_work); -} +#include "dm.h" /*----------------------------------------------------------------- * Each kcopyd client has its own little pool of preallocated * pages for kcopyd io. *---------------------------------------------------------------*/ -struct kcopyd_client { - struct list_head list; - +struct dm_kcopyd_client { spinlock_t lock; struct page_list *pages; unsigned int nr_pages; @@ -50,8 +40,32 @@ struct kcopyd_client { wait_queue_head_t destroyq; atomic_t nr_jobs; + + mempool_t *job_pool; + + struct workqueue_struct *kcopyd_wq; + struct work_struct kcopyd_work; + +/* + * We maintain three lists of jobs: + * + * i) jobs waiting for pages + * ii) jobs that have pages, and are waiting for the io to be issued. + * iii) jobs that have completed. + * + * All three of these are protected by job_lock. + */ + spinlock_t job_lock; + struct list_head complete_jobs; + struct list_head io_jobs; + struct list_head pages_jobs; }; +static void wake(struct dm_kcopyd_client *kc) +{ + queue_work(kc->kcopyd_wq, &kc->kcopyd_work); +} + static struct page_list *alloc_pl(void) { struct page_list *pl; @@ -75,7 +89,7 @@ static void free_pl(struct page_list *pl) kfree(pl); } -static int kcopyd_get_pages(struct kcopyd_client *kc, +static int kcopyd_get_pages(struct dm_kcopyd_client *kc, unsigned int nr, struct page_list **pages) { struct page_list *pl; @@ -98,7 +112,7 @@ static int kcopyd_get_pages(struct kcopyd_client *kc, return 0; } -static void kcopyd_put_pages(struct kcopyd_client *kc, struct page_list *pl) +static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl) { struct page_list *cursor; @@ -126,7 +140,7 @@ static void drop_pages(struct page_list *pl) } } -static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr) +static int client_alloc_pages(struct dm_kcopyd_client *kc, unsigned int nr) { unsigned int i; struct page_list *pl = NULL, *next; @@ -147,7 +161,7 @@ static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr) return 0; } -static void client_free_pages(struct kcopyd_client *kc) +static void client_free_pages(struct dm_kcopyd_client *kc) { BUG_ON(kc->nr_free_pages != kc->nr_pages); drop_pages(kc->pages); @@ -161,7 +175,7 @@ static void client_free_pages(struct kcopyd_client *kc) * ever having to do io (which could cause a deadlock). *---------------------------------------------------------------*/ struct kcopyd_job { - struct kcopyd_client *kc; + struct dm_kcopyd_client *kc; struct list_head list; unsigned long flags; @@ -175,13 +189,13 @@ struct kcopyd_job { * Either READ or WRITE */ int rw; - struct io_region source; + struct dm_io_region source; /* * The destinations for the transfer. */ unsigned int num_dests; - struct io_region dests[KCOPYD_MAX_REGIONS]; + struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS]; sector_t offset; unsigned int nr_pages; @@ -191,7 +205,7 @@ struct kcopyd_job { * Set this to ensure you are notified when the job has * completed. 'context' is for callback to use. */ - kcopyd_notify_fn fn; + dm_kcopyd_notify_fn fn; void *context; /* @@ -207,47 +221,19 @@ struct kcopyd_job { #define MIN_JOBS 512 static struct kmem_cache *_job_cache; -static mempool_t *_job_pool; -/* - * We maintain three lists of jobs: - * - * i) jobs waiting for pages - * ii) jobs that have pages, and are waiting for the io to be issued. - * iii) jobs that have completed. - * - * All three of these are protected by job_lock. - */ -static DEFINE_SPINLOCK(_job_lock); - -static LIST_HEAD(_complete_jobs); -static LIST_HEAD(_io_jobs); -static LIST_HEAD(_pages_jobs); - -static int jobs_init(void) +int __init dm_kcopyd_init(void) { _job_cache = KMEM_CACHE(kcopyd_job, 0); if (!_job_cache) return -ENOMEM; - _job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache); - if (!_job_pool) { - kmem_cache_destroy(_job_cache); - return -ENOMEM; - } - return 0; } -static void jobs_exit(void) +void dm_kcopyd_exit(void) { - BUG_ON(!list_empty(&_complete_jobs)); - BUG_ON(!list_empty(&_io_jobs)); - BUG_ON(!list_empty(&_pages_jobs)); - - mempool_destroy(_job_pool); kmem_cache_destroy(_job_cache); - _job_pool = NULL; _job_cache = NULL; } @@ -255,18 +241,19 @@ static void jobs_exit(void) * Functions to push and pop a job onto the head of a given job * list. */ -static struct kcopyd_job *pop(struct list_head *jobs) +static struct kcopyd_job *pop(struct list_head *jobs, + struct dm_kcopyd_client *kc) { struct kcopyd_job *job = NULL; unsigned long flags; - spin_lock_irqsave(&_job_lock, flags); + spin_lock_irqsave(&kc->job_lock, flags); if (!list_empty(jobs)) { job = list_entry(jobs->next, struct kcopyd_job, list); list_del(&job->list); } - spin_unlock_irqrestore(&_job_lock, flags); + spin_unlock_irqrestore(&kc->job_lock, flags); return job; } @@ -274,10 +261,11 @@ static struct kcopyd_job *pop(struct list_head *jobs) static void push(struct list_head *jobs, struct kcopyd_job *job) { unsigned long flags; + struct dm_kcopyd_client *kc = job->kc; - spin_lock_irqsave(&_job_lock, flags); + spin_lock_irqsave(&kc->job_lock, flags); list_add_tail(&job->list, jobs); - spin_unlock_irqrestore(&_job_lock, flags); + spin_unlock_irqrestore(&kc->job_lock, flags); } /* @@ -294,11 +282,11 @@ static int run_complete_job(struct kcopyd_job *job) void *context = job->context; int read_err = job->read_err; unsigned long write_err = job->write_err; - kcopyd_notify_fn fn = job->fn; - struct kcopyd_client *kc = job->kc; + dm_kcopyd_notify_fn fn = job->fn; + struct dm_kcopyd_client *kc = job->kc; kcopyd_put_pages(kc, job->pages); - mempool_free(job, _job_pool); + mempool_free(job, kc->job_pool); fn(read_err, write_err, context); if (atomic_dec_and_test(&kc->nr_jobs)) @@ -310,6 +298,7 @@ static int run_complete_job(struct kcopyd_job *job) static void complete_io(unsigned long error, void *context) { struct kcopyd_job *job = (struct kcopyd_job *) context; + struct dm_kcopyd_client *kc = job->kc; if (error) { if (job->rw == WRITE) @@ -317,22 +306,22 @@ static void complete_io(unsigned long error, void *context) else job->read_err = 1; - if (!test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) { - push(&_complete_jobs, job); - wake(); + if (!test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) { + push(&kc->complete_jobs, job); + wake(kc); return; } } if (job->rw == WRITE) - push(&_complete_jobs, job); + push(&kc->complete_jobs, job); else { job->rw = WRITE; - push(&_io_jobs, job); + push(&kc->io_jobs, job); } - wake(); + wake(kc); } /* @@ -343,7 +332,7 @@ static int run_io_job(struct kcopyd_job *job) { int r; struct dm_io_request io_req = { - .bi_rw = job->rw, + .bi_rw = job->rw | (1 << BIO_RW_SYNC), .mem.type = DM_IO_PAGE_LIST, .mem.ptr.pl = job->pages, .mem.offset = job->offset, @@ -369,7 +358,7 @@ static int run_pages_job(struct kcopyd_job *job) r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages); if (!r) { /* this job is ready for io */ - push(&_io_jobs, job); + push(&job->kc->io_jobs, job); return 0; } @@ -384,12 +373,13 @@ static int run_pages_job(struct kcopyd_job *job) * Run through a list for as long as possible. Returns the count * of successful jobs. */ -static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *)) +static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc, + int (*fn) (struct kcopyd_job *)) { struct kcopyd_job *job; int r, count = 0; - while ((job = pop(jobs))) { + while ((job = pop(jobs, kc))) { r = fn(job); @@ -399,7 +389,7 @@ static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *)) job->write_err = (unsigned long) -1L; else job->read_err = 1; - push(&_complete_jobs, job); + push(&kc->complete_jobs, job); break; } @@ -421,8 +411,11 @@ static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *)) /* * kcopyd does this every time it's woken up. */ -static void do_work(struct work_struct *ignored) +static void do_work(struct work_struct *work) { + struct dm_kcopyd_client *kc = container_of(work, + struct dm_kcopyd_client, kcopyd_work); + /* * The order that these are called is *very* important. * complete jobs can free some pages for pages jobs. @@ -430,9 +423,9 @@ static void do_work(struct work_struct *ignored) * list. io jobs call wake when they complete and it all * starts again. */ - process_jobs(&_complete_jobs, run_complete_job); - process_jobs(&_pages_jobs, run_pages_job); - process_jobs(&_io_jobs, run_io_job); + process_jobs(&kc->complete_jobs, kc, run_complete_job); + process_jobs(&kc->pages_jobs, kc, run_pages_job); + process_jobs(&kc->io_jobs, kc, run_io_job); } /* @@ -442,9 +435,10 @@ static void do_work(struct work_struct *ignored) */ static void dispatch_job(struct kcopyd_job *job) { - atomic_inc(&job->kc->nr_jobs); - push(&_pages_jobs, job); - wake(); + struct dm_kcopyd_client *kc = job->kc; + atomic_inc(&kc->nr_jobs); + push(&kc->pages_jobs, job); + wake(kc); } #define SUB_JOB_SIZE 128 @@ -469,7 +463,7 @@ static void segment_complete(int read_err, unsigned long write_err, * Only dispatch more work if there hasn't been an error. */ if ((!job->read_err && !job->write_err) || - test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) { + test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) { /* get the next chunk of work */ progress = job->progress; count = job->source.count - progress; @@ -484,7 +478,8 @@ static void segment_complete(int read_err, unsigned long write_err, if (count) { int i; - struct kcopyd_job *sub_job = mempool_alloc(_job_pool, GFP_NOIO); + struct kcopyd_job *sub_job = mempool_alloc(job->kc->job_pool, + GFP_NOIO); *sub_job = *job; sub_job->source.sector += progress; @@ -508,7 +503,7 @@ static void segment_complete(int read_err, unsigned long write_err, * after we've completed. */ job->fn(read_err, write_err, job->context); - mempool_free(job, _job_pool); + mempool_free(job, job->kc->job_pool); } } @@ -526,16 +521,16 @@ static void split_job(struct kcopyd_job *job) segment_complete(0, 0u, job); } -int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, - unsigned int num_dests, struct io_region *dests, - unsigned int flags, kcopyd_notify_fn fn, void *context) +int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, + unsigned int num_dests, struct dm_io_region *dests, + unsigned int flags, dm_kcopyd_notify_fn fn, void *context) { struct kcopyd_job *job; /* * Allocate a new job. */ - job = mempool_alloc(_job_pool, GFP_NOIO); + job = mempool_alloc(kc->job_pool, GFP_NOIO); /* * set up for the read. @@ -569,6 +564,7 @@ int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, return 0; } +EXPORT_SYMBOL(dm_kcopyd_copy); /* * Cancels a kcopyd job, eg. someone might be deactivating a @@ -583,126 +579,76 @@ int kcopyd_cancel(struct kcopyd_job *job, int block) #endif /* 0 */ /*----------------------------------------------------------------- - * Unit setup + * Client setup *---------------------------------------------------------------*/ -static DEFINE_MUTEX(_client_lock); -static LIST_HEAD(_clients); - -static void client_add(struct kcopyd_client *kc) +int dm_kcopyd_client_create(unsigned int nr_pages, + struct dm_kcopyd_client **result) { - mutex_lock(&_client_lock); - list_add(&kc->list, &_clients); - mutex_unlock(&_client_lock); -} - -static void client_del(struct kcopyd_client *kc) -{ - mutex_lock(&_client_lock); - list_del(&kc->list); - mutex_unlock(&_client_lock); -} - -static DEFINE_MUTEX(kcopyd_init_lock); -static int kcopyd_clients = 0; + int r = -ENOMEM; + struct dm_kcopyd_client *kc; -static int kcopyd_init(void) -{ - int r; - - mutex_lock(&kcopyd_init_lock); - - if (kcopyd_clients) { - /* Already initialized. */ - kcopyd_clients++; - mutex_unlock(&kcopyd_init_lock); - return 0; - } - - r = jobs_init(); - if (r) { - mutex_unlock(&kcopyd_init_lock); - return r; - } - - _kcopyd_wq = create_singlethread_workqueue("kcopyd"); - if (!_kcopyd_wq) { - jobs_exit(); - mutex_unlock(&kcopyd_init_lock); + kc = kmalloc(sizeof(*kc), GFP_KERNEL); + if (!kc) return -ENOMEM; - } - - kcopyd_clients++; - INIT_WORK(&_kcopyd_work, do_work); - mutex_unlock(&kcopyd_init_lock); - return 0; -} -static void kcopyd_exit(void) -{ - mutex_lock(&kcopyd_init_lock); - kcopyd_clients--; - if (!kcopyd_clients) { - jobs_exit(); - destroy_workqueue(_kcopyd_wq); - _kcopyd_wq = NULL; - } - mutex_unlock(&kcopyd_init_lock); -} - -int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result) -{ - int r = 0; - struct kcopyd_client *kc; + spin_lock_init(&kc->lock); + spin_lock_init(&kc->job_lock); + INIT_LIST_HEAD(&kc->complete_jobs); + INIT_LIST_HEAD(&kc->io_jobs); + INIT_LIST_HEAD(&kc->pages_jobs); - r = kcopyd_init(); - if (r) - return r; + kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache); + if (!kc->job_pool) + goto bad_slab; - kc = kmalloc(sizeof(*kc), GFP_KERNEL); - if (!kc) { - kcopyd_exit(); - return -ENOMEM; - } + INIT_WORK(&kc->kcopyd_work, do_work); + kc->kcopyd_wq = create_singlethread_workqueue("kcopyd"); + if (!kc->kcopyd_wq) + goto bad_workqueue; - spin_lock_init(&kc->lock); kc->pages = NULL; kc->nr_pages = kc->nr_free_pages = 0; r = client_alloc_pages(kc, nr_pages); - if (r) { - kfree(kc); - kcopyd_exit(); - return r; - } + if (r) + goto bad_client_pages; kc->io_client = dm_io_client_create(nr_pages); if (IS_ERR(kc->io_client)) { r = PTR_ERR(kc->io_client); - client_free_pages(kc); - kfree(kc); - kcopyd_exit(); - return r; + goto bad_io_client; } init_waitqueue_head(&kc->destroyq); atomic_set(&kc->nr_jobs, 0); - client_add(kc); *result = kc; return 0; + +bad_io_client: + client_free_pages(kc); +bad_client_pages: + destroy_workqueue(kc->kcopyd_wq); +bad_workqueue: + mempool_destroy(kc->job_pool); +bad_slab: + kfree(kc); + + return r; } +EXPORT_SYMBOL(dm_kcopyd_client_create); -void kcopyd_client_destroy(struct kcopyd_client *kc) +void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc) { /* Wait for completion of all jobs submitted by this client. */ wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs)); + BUG_ON(!list_empty(&kc->complete_jobs)); + BUG_ON(!list_empty(&kc->io_jobs)); + BUG_ON(!list_empty(&kc->pages_jobs)); + destroy_workqueue(kc->kcopyd_wq); dm_io_client_destroy(kc->io_client); client_free_pages(kc); - client_del(kc); + mempool_destroy(kc->job_pool); kfree(kc); - kcopyd_exit(); } - -EXPORT_SYMBOL(kcopyd_client_create); -EXPORT_SYMBOL(kcopyd_client_destroy); -EXPORT_SYMBOL(kcopyd_copy); +EXPORT_SYMBOL(dm_kcopyd_client_destroy); diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 2a74b2142f5..67a6f31b7fc 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -1,5 +1,6 @@ /* * Copyright (C) 2003 Sistina Software + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This file is released under the LGPL. */ @@ -8,64 +9,58 @@ #include <linux/slab.h> #include <linux/module.h> #include <linux/vmalloc.h> +#include <linux/dm-io.h> +#include <linux/dm-dirty-log.h> -#include "dm-log.h" -#include "dm-io.h" +#include "dm.h" -#define DM_MSG_PREFIX "mirror log" +#define DM_MSG_PREFIX "dirty region log" -static LIST_HEAD(_log_types); -static DEFINE_SPINLOCK(_lock); +struct dm_dirty_log_internal { + struct dm_dirty_log_type *type; -int dm_register_dirty_log_type(struct dirty_log_type *type) -{ - spin_lock(&_lock); - type->use_count = 0; - list_add(&type->list, &_log_types); - spin_unlock(&_lock); + struct list_head list; + long use; +}; - return 0; -} +static LIST_HEAD(_log_types); +static DEFINE_SPINLOCK(_lock); -int dm_unregister_dirty_log_type(struct dirty_log_type *type) +static struct dm_dirty_log_internal *__find_dirty_log_type(const char *name) { - spin_lock(&_lock); - - if (type->use_count) - DMWARN("Attempt to unregister a log type that is still in use"); - else - list_del(&type->list); + struct dm_dirty_log_internal *log_type; - spin_unlock(&_lock); + list_for_each_entry(log_type, &_log_types, list) + if (!strcmp(name, log_type->type->name)) + return log_type; - return 0; + return NULL; } -static struct dirty_log_type *_get_type(const char *type_name) +static struct dm_dirty_log_internal *_get_dirty_log_type(const char *name) { - struct dirty_log_type *type; + struct dm_dirty_log_internal *log_type; spin_lock(&_lock); - list_for_each_entry (type, &_log_types, list) - if (!strcmp(type_name, type->name)) { - if (!type->use_count && !try_module_get(type->module)){ - spin_unlock(&_lock); - return NULL; - } - type->use_count++; - spin_unlock(&_lock); - return type; - } + + log_type = __find_dirty_log_type(name); + if (log_type) { + if (!log_type->use && !try_module_get(log_type->type->module)) + log_type = NULL; + else + log_type->use++; + } spin_unlock(&_lock); - return NULL; + + return log_type; } /* * get_type * @type_name * - * Attempt to retrieve the dirty_log_type by name. If not already + * Attempt to retrieve the dm_dirty_log_type by name. If not already * available, attempt to load the appropriate module. * * Log modules are named "dm-log-" followed by the 'type_name'. @@ -78,14 +73,17 @@ static struct dirty_log_type *_get_type(const char *type_name) * * Returns: dirty_log_type* on success, NULL on failure */ -static struct dirty_log_type *get_type(const char *type_name) +static struct dm_dirty_log_type *get_type(const char *type_name) { char *p, *type_name_dup; - struct dirty_log_type *type; + struct dm_dirty_log_internal *log_type; + + if (!type_name) + return NULL; - type = _get_type(type_name); - if (type) - return type; + log_type = _get_dirty_log_type(type_name); + if (log_type) + return log_type->type; type_name_dup = kstrdup(type_name, GFP_KERNEL); if (!type_name_dup) { @@ -95,34 +93,106 @@ static struct dirty_log_type *get_type(const char *type_name) } while (request_module("dm-log-%s", type_name_dup) || - !(type = _get_type(type_name))) { + !(log_type = _get_dirty_log_type(type_name))) { p = strrchr(type_name_dup, '-'); if (!p) break; p[0] = '\0'; } - if (!type) + if (!log_type) DMWARN("Module for logging type \"%s\" not found.", type_name); kfree(type_name_dup); - return type; + return log_type ? log_type->type : NULL; } -static void put_type(struct dirty_log_type *type) +static void put_type(struct dm_dirty_log_type *type) { + struct dm_dirty_log_internal *log_type; + + if (!type) + return; + spin_lock(&_lock); - if (!--type->use_count) + log_type = __find_dirty_log_type(type->name); + if (!log_type) + goto out; + + if (!--log_type->use) module_put(type->module); + + BUG_ON(log_type->use < 0); + +out: spin_unlock(&_lock); } -struct dirty_log *dm_create_dirty_log(const char *type_name, struct dm_target *ti, - unsigned int argc, char **argv) +static struct dm_dirty_log_internal *_alloc_dirty_log_type(struct dm_dirty_log_type *type) { - struct dirty_log_type *type; - struct dirty_log *log; + struct dm_dirty_log_internal *log_type = kzalloc(sizeof(*log_type), + GFP_KERNEL); + + if (log_type) + log_type->type = type; + + return log_type; +} + +int dm_dirty_log_type_register(struct dm_dirty_log_type *type) +{ + struct dm_dirty_log_internal *log_type = _alloc_dirty_log_type(type); + int r = 0; + + if (!log_type) + return -ENOMEM; + + spin_lock(&_lock); + if (!__find_dirty_log_type(type->name)) + list_add(&log_type->list, &_log_types); + else { + kfree(log_type); + r = -EEXIST; + } + spin_unlock(&_lock); + + return r; +} +EXPORT_SYMBOL(dm_dirty_log_type_register); + +int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type) +{ + struct dm_dirty_log_internal *log_type; + + spin_lock(&_lock); + + log_type = __find_dirty_log_type(type->name); + if (!log_type) { + spin_unlock(&_lock); + return -EINVAL; + } + + if (log_type->use) { + spin_unlock(&_lock); + return -ETXTBSY; + } + + list_del(&log_type->list); + + spin_unlock(&_lock); + kfree(log_type); + + return 0; +} +EXPORT_SYMBOL(dm_dirty_log_type_unregister); + +struct dm_dirty_log *dm_dirty_log_create(const char *type_name, + struct dm_target *ti, + unsigned int argc, char **argv) +{ + struct dm_dirty_log_type *type; + struct dm_dirty_log *log; log = kmalloc(sizeof(*log), GFP_KERNEL); if (!log) @@ -143,13 +213,15 @@ struct dirty_log *dm_create_dirty_log(const char *type_name, struct dm_target *t return log; } +EXPORT_SYMBOL(dm_dirty_log_create); -void dm_destroy_dirty_log(struct dirty_log *log) +void dm_dirty_log_destroy(struct dm_dirty_log *log) { log->type->dtr(log); put_type(log->type); kfree(log); } +EXPORT_SYMBOL(dm_dirty_log_destroy); /*----------------------------------------------------------------- * Persistent and core logs share a lot of their implementation. @@ -207,7 +279,7 @@ struct log_c { struct dm_dev *log_dev; struct log_header header; - struct io_region header_location; + struct dm_io_region header_location; struct log_header *disk_header; }; @@ -215,7 +287,7 @@ struct log_c { * The touched member needs to be updated every time we access * one of the bitsets. */ -static inline int log_test_bit(uint32_t *bs, unsigned bit) +static inline int log_test_bit(uint32_t *bs, unsigned bit) { return ext2_test_bit(bit, (unsigned long *) bs) ? 1 : 0; } @@ -302,7 +374,7 @@ static inline int write_header(struct log_c *log) * argv contains region_size followed optionally by [no]sync *--------------------------------------------------------------*/ #define BYTE_SHIFT 3 -static int create_log_context(struct dirty_log *log, struct dm_target *ti, +static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, unsigned int argc, char **argv, struct dm_dev *dev) { @@ -315,7 +387,7 @@ static int create_log_context(struct dirty_log *log, struct dm_target *ti, int r; if (argc < 1 || argc > 2) { - DMWARN("wrong number of arguments to mirror log"); + DMWARN("wrong number of arguments to dirty region log"); return -EINVAL; } @@ -325,8 +397,8 @@ static int create_log_context(struct dirty_log *log, struct dm_target *ti, else if (!strcmp(argv[1], "nosync")) sync = NOSYNC; else { - DMWARN("unrecognised sync argument to mirror log: %s", - argv[1]); + DMWARN("unrecognised sync argument to " + "dirty region log: %s", argv[1]); return -EINVAL; } } @@ -434,7 +506,7 @@ static int create_log_context(struct dirty_log *log, struct dm_target *ti, return 0; } -static int core_ctr(struct dirty_log *log, struct dm_target *ti, +static int core_ctr(struct dm_dirty_log *log, struct dm_target *ti, unsigned int argc, char **argv) { return create_log_context(log, ti, argc, argv, NULL); @@ -447,7 +519,7 @@ static void destroy_log_context(struct log_c *lc) kfree(lc); } -static void core_dtr(struct dirty_log *log) +static void core_dtr(struct dm_dirty_log *log) { struct log_c *lc = (struct log_c *) log->context; @@ -460,14 +532,14 @@ static void core_dtr(struct dirty_log *log) * * argv contains log_device region_size followed optionally by [no]sync *--------------------------------------------------------------*/ -static int disk_ctr(struct dirty_log *log, struct dm_target *ti, +static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti, unsigned int argc, char **argv) { int r; struct dm_dev *dev; if (argc < 2 || argc > 3) { - DMWARN("wrong number of arguments to disk mirror log"); + DMWARN("wrong number of arguments to disk dirty region log"); return -EINVAL; } @@ -485,7 +557,7 @@ static int disk_ctr(struct dirty_log *log, struct dm_target *ti, return 0; } -static void disk_dtr(struct dirty_log *log) +static void disk_dtr(struct dm_dirty_log *log) { struct log_c *lc = (struct log_c *) log->context; @@ -514,7 +586,7 @@ static void fail_log_device(struct log_c *lc) dm_table_event(lc->ti->table); } -static int disk_resume(struct dirty_log *log) +static int disk_resume(struct dm_dirty_log *log) { int r; unsigned i; @@ -524,7 +596,7 @@ static int disk_resume(struct dirty_log *log) /* read the disk header */ r = read_header(lc); if (r) { - DMWARN("%s: Failed to read header on mirror log device", + DMWARN("%s: Failed to read header on dirty region log device", lc->log_dev->name); fail_log_device(lc); /* @@ -562,7 +634,7 @@ static int disk_resume(struct dirty_log *log) /* write the new header */ r = write_header(lc); if (r) { - DMWARN("%s: Failed to write header on mirror log device", + DMWARN("%s: Failed to write header on dirty region log device", lc->log_dev->name); fail_log_device(lc); } @@ -570,38 +642,38 @@ static int disk_resume(struct dirty_log *log) return r; } -static uint32_t core_get_region_size(struct dirty_log *log) +static uint32_t core_get_region_size(struct dm_dirty_log *log) { struct log_c *lc = (struct log_c *) log->context; return lc->region_size; } -static int core_resume(struct dirty_log *log) +static int core_resume(struct dm_dirty_log *log) { struct log_c *lc = (struct log_c *) log->context; lc->sync_search = 0; return 0; } -static int core_is_clean(struct dirty_log *log, region_t region) +static int core_is_clean(struct dm_dirty_log *log, region_t region) { struct log_c *lc = (struct log_c *) log->context; return log_test_bit(lc->clean_bits, region); } -static int core_in_sync(struct dirty_log *log, region_t region, int block) +static int core_in_sync(struct dm_dirty_log *log, region_t region, int block) { struct log_c *lc = (struct log_c *) log->context; return log_test_bit(lc->sync_bits, region); } -static int core_flush(struct dirty_log *log) +static int core_flush(struct dm_dirty_log *log) { /* no op */ return 0; } -static int disk_flush(struct dirty_log *log) +static int disk_flush(struct dm_dirty_log *log) { int r; struct log_c *lc = (struct log_c *) log->context; @@ -619,19 +691,19 @@ static int disk_flush(struct dirty_log *log) return r; } -static void core_mark_region(struct dirty_log *log, region_t region) +static void core_mark_region(struct dm_dirty_log *log, region_t region) { struct log_c *lc = (struct log_c *) log->context; log_clear_bit(lc, lc->clean_bits, region); } -static void core_clear_region(struct dirty_log *log, region_t region) +static void core_clear_region(struct dm_dirty_log *log, region_t region) { struct log_c *lc = (struct log_c *) log->context; log_set_bit(lc, lc->clean_bits, region); } -static int core_get_resync_work(struct dirty_log *log, region_t *region) +static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) { struct log_c *lc = (struct log_c *) log->context; @@ -654,7 +726,7 @@ static int core_get_resync_work(struct dirty_log *log, region_t *region) return 1; } -static void core_set_region_sync(struct dirty_log *log, region_t region, +static void core_set_region_sync(struct dm_dirty_log *log, region_t region, int in_sync) { struct log_c *lc = (struct log_c *) log->context; @@ -669,7 +741,7 @@ static void core_set_region_sync(struct dirty_log *log, region_t region, } } -static region_t core_get_sync_count(struct dirty_log *log) +static region_t core_get_sync_count(struct dm_dirty_log *log) { struct log_c *lc = (struct log_c *) log->context; @@ -680,7 +752,7 @@ static region_t core_get_sync_count(struct dirty_log *log) if (lc->sync != DEFAULTSYNC) \ DMEMIT("%ssync ", lc->sync == NOSYNC ? "no" : "") -static int core_status(struct dirty_log *log, status_type_t status, +static int core_status(struct dm_dirty_log *log, status_type_t status, char *result, unsigned int maxlen) { int sz = 0; @@ -700,7 +772,7 @@ static int core_status(struct dirty_log *log, status_type_t status, return sz; } -static int disk_status(struct dirty_log *log, status_type_t status, +static int disk_status(struct dm_dirty_log *log, status_type_t status, char *result, unsigned int maxlen) { int sz = 0; @@ -722,7 +794,7 @@ static int disk_status(struct dirty_log *log, status_type_t status, return sz; } -static struct dirty_log_type _core_type = { +static struct dm_dirty_log_type _core_type = { .name = "core", .module = THIS_MODULE, .ctr = core_ctr, @@ -740,7 +812,7 @@ static struct dirty_log_type _core_type = { .status = core_status, }; -static struct dirty_log_type _disk_type = { +static struct dm_dirty_log_type _disk_type = { .name = "disk", .module = THIS_MODULE, .ctr = disk_ctr, @@ -763,26 +835,28 @@ int __init dm_dirty_log_init(void) { int r; - r = dm_register_dirty_log_type(&_core_type); + r = dm_dirty_log_type_register(&_core_type); if (r) DMWARN("couldn't register core log"); - r = dm_register_dirty_log_type(&_disk_type); + r = dm_dirty_log_type_register(&_disk_type); if (r) { DMWARN("couldn't register disk type"); - dm_unregister_dirty_log_type(&_core_type); + dm_dirty_log_type_unregister(&_core_type); } return r; } -void dm_dirty_log_exit(void) +void __exit dm_dirty_log_exit(void) { - dm_unregister_dirty_log_type(&_disk_type); - dm_unregister_dirty_log_type(&_core_type); + dm_dirty_log_type_unregister(&_disk_type); + dm_dirty_log_type_unregister(&_core_type); } -EXPORT_SYMBOL(dm_register_dirty_log_type); -EXPORT_SYMBOL(dm_unregister_dirty_log_type); -EXPORT_SYMBOL(dm_create_dirty_log); -EXPORT_SYMBOL(dm_destroy_dirty_log); +module_init(dm_dirty_log_init); +module_exit(dm_dirty_log_exit); + +MODULE_DESCRIPTION(DM_NAME " dirty region log"); +MODULE_AUTHOR("Joe Thornber, Heinz Mauelshagen <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-log.h b/drivers/md/dm-log.h deleted file mode 100644 index 3fae87eb596..00000000000 --- a/drivers/md/dm-log.h +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (C) 2003 Sistina Software - * - * This file is released under the LGPL. - */ - -#ifndef DM_DIRTY_LOG -#define DM_DIRTY_LOG - -#include "dm.h" - -typedef sector_t region_t; - -struct dirty_log_type; - -struct dirty_log { - struct dirty_log_type *type; - void *context; -}; - -struct dirty_log_type { - struct list_head list; - const char *name; - struct module *module; - unsigned int use_count; - - int (*ctr)(struct dirty_log *log, struct dm_target *ti, - unsigned int argc, char **argv); - void (*dtr)(struct dirty_log *log); - - /* - * There are times when we don't want the log to touch - * the disk. - */ - int (*presuspend)(struct dirty_log *log); - int (*postsuspend)(struct dirty_log *log); - int (*resume)(struct dirty_log *log); - - /* - * Retrieves the smallest size of region that the log can - * deal with. - */ - uint32_t (*get_region_size)(struct dirty_log *log); - - /* - * A predicate to say whether a region is clean or not. - * May block. - */ - int (*is_clean)(struct dirty_log *log, region_t region); - - /* - * Returns: 0, 1, -EWOULDBLOCK, < 0 - * - * A predicate function to check the area given by - * [sector, sector + len) is in sync. - * - * If -EWOULDBLOCK is returned the state of the region is - * unknown, typically this will result in a read being - * passed to a daemon to deal with, since a daemon is - * allowed to block. - */ - int (*in_sync)(struct dirty_log *log, region_t region, int can_block); - - /* - * Flush the current log state (eg, to disk). This - * function may block. - */ - int (*flush)(struct dirty_log *log); - - /* - * Mark an area as clean or dirty. These functions may - * block, though for performance reasons blocking should - * be extremely rare (eg, allocating another chunk of - * memory for some reason). - */ - void (*mark_region)(struct dirty_log *log, region_t region); - void (*clear_region)(struct dirty_log *log, region_t region); - - /* - * Returns: <0 (error), 0 (no region), 1 (region) - * - * The mirrord will need perform recovery on regions of - * the mirror that are in the NOSYNC state. This - * function asks the log to tell the caller about the - * next region that this machine should recover. - * - * Do not confuse this function with 'in_sync()', one - * tells you if an area is synchronised, the other - * assigns recovery work. - */ - int (*get_resync_work)(struct dirty_log *log, region_t *region); - - /* - * This notifies the log that the resync status of a region - * has changed. It also clears the region from the recovering - * list (if present). - */ - void (*set_region_sync)(struct dirty_log *log, - region_t region, int in_sync); - - /* - * Returns the number of regions that are in sync. - */ - region_t (*get_sync_count)(struct dirty_log *log); - - /* - * Support function for mirror status requests. - */ - int (*status)(struct dirty_log *log, status_type_t status_type, - char *result, unsigned int maxlen); -}; - -int dm_register_dirty_log_type(struct dirty_log_type *type); -int dm_unregister_dirty_log_type(struct dirty_log_type *type); - - -/* - * Make sure you use these two functions, rather than calling - * type->constructor/destructor() directly. - */ -struct dirty_log *dm_create_dirty_log(const char *type_name, struct dm_target *ti, - unsigned int argc, char **argv); -void dm_destroy_dirty_log(struct dirty_log *log); - -/* - * init/exit functions. - */ -int dm_dirty_log_init(void); -void dm_dirty_log_exit(void); - -#endif diff --git a/drivers/md/dm-mpath-hp-sw.c b/drivers/md/dm-mpath-hp-sw.c index 204bf42c944..b63a0ab37c5 100644 --- a/drivers/md/dm-mpath-hp-sw.c +++ b/drivers/md/dm-mpath-hp-sw.c @@ -137,7 +137,6 @@ static struct request *hp_sw_get_request(struct dm_path *path) req->sense = h->sense; memset(req->sense, 0, SCSI_SENSE_BUFFERSIZE); - memset(&req->cmd, 0, BLK_MAX_CDB); req->cmd[0] = START_STOP; req->cmd[4] = 1; req->cmd_len = COMMAND_SIZE(req->cmd[0]); diff --git a/drivers/md/dm-mpath-rdac.c b/drivers/md/dm-mpath-rdac.c index e04eb5c697f..95e77734880 100644 --- a/drivers/md/dm-mpath-rdac.c +++ b/drivers/md/dm-mpath-rdac.c @@ -284,7 +284,6 @@ static struct request *get_rdac_req(struct rdac_handler *h, return NULL; } - memset(&rq->cmd, 0, BLK_MAX_CDB); rq->sense = h->sense; memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE); rq->sense_len = 0; diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 762cb086bb7..ff05fe89308 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -7,9 +7,6 @@ #include "dm.h" #include "dm-bio-list.h" #include "dm-bio-record.h" -#include "dm-io.h" -#include "dm-log.h" -#include "kcopyd.h" #include <linux/ctype.h> #include <linux/init.h> @@ -22,6 +19,9 @@ #include <linux/workqueue.h> #include <linux/log2.h> #include <linux/hardirq.h> +#include <linux/dm-io.h> +#include <linux/dm-dirty-log.h> +#include <linux/dm-kcopyd.h> #define DM_MSG_PREFIX "raid1" #define DM_IO_PAGES 64 @@ -74,7 +74,7 @@ struct region_hash { unsigned region_shift; /* holds persistent region state */ - struct dirty_log *log; + struct dm_dirty_log *log; /* hash table */ rwlock_t hash_lock; @@ -133,7 +133,7 @@ struct mirror_set { struct dm_target *ti; struct list_head list; struct region_hash rh; - struct kcopyd_client *kcopyd_client; + struct dm_kcopyd_client *kcopyd_client; uint64_t features; spinlock_t lock; /* protects the lists */ @@ -154,6 +154,9 @@ struct mirror_set { struct workqueue_struct *kmirrord_wq; struct work_struct kmirrord_work; + struct timer_list timer; + unsigned long timer_pending; + struct work_struct trigger_event; unsigned int nr_mirrors; @@ -178,13 +181,32 @@ static void wake(struct mirror_set *ms) queue_work(ms->kmirrord_wq, &ms->kmirrord_work); } +static void delayed_wake_fn(unsigned long data) +{ + struct mirror_set *ms = (struct mirror_set *) data; + + clear_bit(0, &ms->timer_pending); + wake(ms); +} + +static void delayed_wake(struct mirror_set *ms) +{ + if (test_and_set_bit(0, &ms->timer_pending)) + return; + + ms->timer.expires = jiffies + HZ / 5; + ms->timer.data = (unsigned long) ms; + ms->timer.function = delayed_wake_fn; + add_timer(&ms->timer); +} + /* FIXME move this */ static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw); #define MIN_REGIONS 64 #define MAX_RECOVERY 1 static int rh_init(struct region_hash *rh, struct mirror_set *ms, - struct dirty_log *log, uint32_t region_size, + struct dm_dirty_log *log, uint32_t region_size, region_t nr_regions) { unsigned int nr_buckets, max_buckets; @@ -249,7 +271,7 @@ static void rh_exit(struct region_hash *rh) } if (rh->log) - dm_destroy_dirty_log(rh->log); + dm_dirty_log_destroy(rh->log); if (rh->region_pool) mempool_destroy(rh->region_pool); vfree(rh->buckets); @@ -405,24 +427,22 @@ static void rh_update_states(struct region_hash *rh) write_lock_irq(&rh->hash_lock); spin_lock(&rh->region_lock); if (!list_empty(&rh->clean_regions)) { - list_splice(&rh->clean_regions, &clean); - INIT_LIST_HEAD(&rh->clean_regions); + list_splice_init(&rh->clean_regions, &clean); list_for_each_entry(reg, &clean, list) list_del(®->hash_list); } if (!list_empty(&rh->recovered_regions)) { - list_splice(&rh->recovered_regions, &recovered); - INIT_LIST_HEAD(&rh->recovered_regions); + list_splice_init(&rh->recovered_regions, &recovered); list_for_each_entry (reg, &recovered, list) list_del(®->hash_list); } if (!list_empty(&rh->failed_recovered_regions)) { - list_splice(&rh->failed_recovered_regions, &failed_recovered); - INIT_LIST_HEAD(&rh->failed_recovered_regions); + list_splice_init(&rh->failed_recovered_regions, + &failed_recovered); list_for_each_entry(reg, &failed_recovered, list) list_del(®->hash_list); @@ -790,7 +810,7 @@ static int recover(struct mirror_set *ms, struct region *reg) { int r; unsigned int i; - struct io_region from, to[KCOPYD_MAX_REGIONS], *dest; + struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest; struct mirror *m; unsigned long flags = 0; @@ -822,9 +842,9 @@ static int recover(struct mirror_set *ms, struct region *reg) } /* hand to kcopyd */ - set_bit(KCOPYD_IGNORE_ERROR, &flags); - r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags, - recovery_complete, reg); + set_bit(DM_KCOPYD_IGNORE_ERROR, &flags); + r = dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, + flags, recovery_complete, reg); return r; } @@ -833,7 +853,7 @@ static void do_recovery(struct mirror_set *ms) { int r; struct region *reg; - struct dirty_log *log = ms->rh.log; + struct dm_dirty_log *log = ms->rh.log; /* * Start quiescing some regions. @@ -909,7 +929,7 @@ static void map_bio(struct mirror *m, struct bio *bio) bio->bi_sector = map_sector(m, bio); } -static void map_region(struct io_region *io, struct mirror *m, +static void map_region(struct dm_io_region *io, struct mirror *m, struct bio *bio) { io->bdev = m->dev->bdev; @@ -951,7 +971,7 @@ static void read_callback(unsigned long error, void *context) /* Asynchronous read. */ static void read_async_bio(struct mirror *m, struct bio *bio) { - struct io_region io; + struct dm_io_region io; struct dm_io_request io_req = { .bi_rw = READ, .mem.type = DM_IO_BVEC, @@ -1019,7 +1039,7 @@ static void __bio_mark_nosync(struct mirror_set *ms, { unsigned long flags; struct region_hash *rh = &ms->rh; - struct dirty_log *log = ms->rh.log; + struct dm_dirty_log *log = ms->rh.log; struct region *reg; region_t region = bio_to_region(rh, bio); int recovering = 0; @@ -1107,7 +1127,7 @@ out: static void do_write(struct mirror_set *ms, struct bio *bio) { unsigned int i; - struct io_region io[ms->nr_mirrors], *dest = io; + struct dm_io_region io[ms->nr_mirrors], *dest = io; struct mirror *m; struct dm_io_request io_req = { .bi_rw = WRITE, @@ -1182,6 +1202,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) spin_lock_irq(&ms->lock); bio_list_merge(&ms->failures, &sync); spin_unlock_irq(&ms->lock); + wake(ms); } else while ((bio = bio_list_pop(&sync))) do_write(ms, bio); @@ -1241,7 +1262,7 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures) bio_list_merge(&ms->failures, failures); spin_unlock_irq(&ms->lock); - wake(ms); + delayed_wake(ms); } static void trigger_event(struct work_struct *work) @@ -1255,7 +1276,7 @@ static void trigger_event(struct work_struct *work) /*----------------------------------------------------------------- * kmirrord *---------------------------------------------------------------*/ -static int _do_mirror(struct work_struct *work) +static void do_mirror(struct work_struct *work) { struct mirror_set *ms =container_of(work, struct mirror_set, kmirrord_work); @@ -1277,23 +1298,7 @@ static int _do_mirror(struct work_struct *work) do_writes(ms, &writes); do_failures(ms, &failures); - return (ms->failures.head) ? 1 : 0; -} - -static void do_mirror(struct work_struct *work) -{ - /* - * If _do_mirror returns 1, we give it - * another shot. This helps for cases like - * 'suspend' where we call flush_workqueue - * and expect all work to be finished. If - * a failure happens during a suspend, we - * couldn't issue a 'wake' because it would - * not be honored. Therefore, we return '1' - * from _do_mirror, and retry here. - */ - while (_do_mirror(work)) - schedule(); + dm_table_unplug_all(ms->ti->table); } @@ -1303,7 +1308,7 @@ static void do_mirror(struct work_struct *work) static struct mirror_set *alloc_context(unsigned int nr_mirrors, uint32_t region_size, struct dm_target *ti, - struct dirty_log *dl) + struct dm_dirty_log *dl) { size_t len; struct mirror_set *ms = NULL; @@ -1403,12 +1408,12 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti, /* * Create dirty log: log_type #log_params <log_params> */ -static struct dirty_log *create_dirty_log(struct dm_target *ti, +static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, unsigned int argc, char **argv, unsigned int *args_used) { unsigned int param_count; - struct dirty_log *dl; + struct dm_dirty_log *dl; if (argc < 2) { ti->error = "Insufficient mirror log arguments"; @@ -1427,7 +1432,7 @@ static struct dirty_log *create_dirty_log(struct dm_target *ti, return NULL; } - dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2); + dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2); if (!dl) { ti->error = "Error creating mirror dirty log"; return NULL; @@ -1435,7 +1440,7 @@ static struct dirty_log *create_dirty_log(struct dm_target *ti, if (!_check_region_size(ti, dl->type->get_region_size(dl))) { ti->error = "Invalid region size"; - dm_destroy_dirty_log(dl); + dm_dirty_log_destroy(dl); return NULL; } @@ -1496,7 +1501,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) int r; unsigned int nr_mirrors, m, args_used; struct mirror_set *ms; - struct dirty_log *dl; + struct dm_dirty_log *dl; dl = create_dirty_log(ti, argc, argv, &args_used); if (!dl) @@ -1506,9 +1511,9 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) argc -= args_used; if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || - nr_mirrors < 2 || nr_mirrors > KCOPYD_MAX_REGIONS + 1) { + nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) { ti->error = "Invalid number of mirrors"; - dm_destroy_dirty_log(dl); + dm_dirty_log_destroy(dl); return -EINVAL; } @@ -1516,13 +1521,13 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (argc < nr_mirrors * 2) { ti->error = "Too few mirror arguments"; - dm_destroy_dirty_log(dl); + dm_dirty_log_destroy(dl); return -EINVAL; } ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl); if (!ms) { - dm_destroy_dirty_log(dl); + dm_dirty_log_destroy(dl); return -ENOMEM; } @@ -1547,6 +1552,8 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto err_free_context; } INIT_WORK(&ms->kmirrord_work, do_mirror); + init_timer(&ms->timer); + ms->timer_pending = 0; INIT_WORK(&ms->trigger_event, trigger_event); r = parse_features(ms, argc, argv, &args_used); @@ -1571,7 +1578,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto err_destroy_wq; } - r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client); + r = dm_kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client); if (r) goto err_destroy_wq; @@ -1589,8 +1596,9 @@ static void mirror_dtr(struct dm_target *ti) { struct mirror_set *ms = (struct mirror_set *) ti->private; + del_timer_sync(&ms->timer); flush_workqueue(ms->kmirrord_wq); - kcopyd_client_destroy(ms->kcopyd_client); + dm_kcopyd_client_destroy(ms->kcopyd_client); destroy_workqueue(ms->kmirrord_wq); free_context(ms, ti, ms->nr_mirrors); } @@ -1734,7 +1742,7 @@ out: static void mirror_presuspend(struct dm_target *ti) { struct mirror_set *ms = (struct mirror_set *) ti->private; - struct dirty_log *log = ms->rh.log; + struct dm_dirty_log *log = ms->rh.log; atomic_set(&ms->suspend, 1); @@ -1763,7 +1771,7 @@ static void mirror_presuspend(struct dm_target *ti) static void mirror_postsuspend(struct dm_target *ti) { struct mirror_set *ms = ti->private; - struct dirty_log *log = ms->rh.log; + struct dm_dirty_log *log = ms->rh.log; if (log->type->postsuspend && log->type->postsuspend(log)) /* FIXME: need better error handling */ @@ -1773,7 +1781,7 @@ static void mirror_postsuspend(struct dm_target *ti) static void mirror_resume(struct dm_target *ti) { struct mirror_set *ms = ti->private; - struct dirty_log *log = ms->rh.log; + struct dm_dirty_log *log = ms->rh.log; atomic_set(&ms->suspend, 0); if (log->type->resume && log->type->resume(log)) @@ -1811,7 +1819,7 @@ static int mirror_status(struct dm_target *ti, status_type_t type, { unsigned int m, sz = 0; struct mirror_set *ms = (struct mirror_set *) ti->private; - struct dirty_log *log = ms->rh.log; + struct dm_dirty_log *log = ms->rh.log; char buffer[ms->nr_mirrors + 1]; switch (type) { @@ -1864,15 +1872,9 @@ static int __init dm_mirror_init(void) { int r; - r = dm_dirty_log_init(); - if (r) - return r; - r = dm_register_target(&mirror_target); - if (r < 0) { + if (r < 0) DMERR("Failed to register mirror target"); - dm_dirty_log_exit(); - } return r; } @@ -1884,8 +1886,6 @@ static void __exit dm_mirror_exit(void) r = dm_unregister_target(&mirror_target); if (r < 0) DMERR("unregister failed %d", r); - - dm_dirty_log_exit(); } /* Module hooks */ diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 4dc8a43c034..1ba8a47d61b 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -18,10 +18,10 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/log2.h> +#include <linux/dm-kcopyd.h> #include "dm-snap.h" #include "dm-bio-list.h" -#include "kcopyd.h" #define DM_MSG_PREFIX "snapshots" @@ -36,9 +36,9 @@ #define SNAPSHOT_COPY_PRIORITY 2 /* - * Each snapshot reserves this many pages for io + * Reserve 1MB for each snapshot initially (with minimum of 1 page). */ -#define SNAPSHOT_PAGES 256 +#define SNAPSHOT_PAGES (((1UL << 20) >> PAGE_SHIFT) ? : 1) static struct workqueue_struct *ksnapd; static void flush_queued_bios(struct work_struct *work); @@ -536,7 +536,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->last_percent = 0; init_rwsem(&s->lock); spin_lock_init(&s->pe_lock); - s->table = ti->table; + s->ti = ti; /* Allocate hash table for COW data */ if (init_hash_tables(s)) { @@ -558,7 +558,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad4; } - r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client); + r = dm_kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client); if (r) { ti->error = "Could not create kcopyd client"; goto bad5; @@ -591,7 +591,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) return 0; bad6: - kcopyd_client_destroy(s->kcopyd_client); + dm_kcopyd_client_destroy(s->kcopyd_client); bad5: s->store.destroy(&s->store); @@ -613,7 +613,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) static void __free_exceptions(struct dm_snapshot *s) { - kcopyd_client_destroy(s->kcopyd_client); + dm_kcopyd_client_destroy(s->kcopyd_client); s->kcopyd_client = NULL; exit_exception_table(&s->pending, pending_cache); @@ -699,7 +699,7 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err) s->valid = 0; - dm_table_event(s->table); + dm_table_event(s->ti->table); } static void get_pending_exception(struct dm_snap_pending_exception *pe) @@ -824,7 +824,7 @@ static void copy_callback(int read_err, unsigned long write_err, void *context) static void start_copy(struct dm_snap_pending_exception *pe) { struct dm_snapshot *s = pe->snap; - struct io_region src, dest; + struct dm_io_region src, dest; struct block_device *bdev = s->origin->bdev; sector_t dev_size; @@ -839,7 +839,7 @@ static void start_copy(struct dm_snap_pending_exception *pe) dest.count = src.count; /* Hand over to kcopyd */ - kcopyd_copy(s->kcopyd_client, + dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe); } @@ -1060,7 +1060,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) goto next_snapshot; /* Nothing to do if writing beyond end of snapshot */ - if (bio->bi_sector >= dm_table_get_size(snap->table)) + if (bio->bi_sector >= dm_table_get_size(snap->ti->table)) goto next_snapshot; /* diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h index 93bce5d4974..24f9fb73b98 100644 --- a/drivers/md/dm-snap.h +++ b/drivers/md/dm-snap.h @@ -132,7 +132,7 @@ struct exception_store { struct dm_snapshot { struct rw_semaphore lock; - struct dm_table *table; + struct dm_target *ti; struct dm_dev *origin; struct dm_dev *cow; @@ -169,7 +169,7 @@ struct dm_snapshot { /* The on disk metadata handler */ struct exception_store store; - struct kcopyd_client *kcopyd_client; + struct dm_kcopyd_client *kcopyd_client; /* Queue of snapshot writes for ksnapd to flush */ struct bio_list queued_bios; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e75b1437b58..94116eaf470 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -245,44 +245,6 @@ int dm_table_create(struct dm_table **result, int mode, return 0; } -int dm_create_error_table(struct dm_table **result, struct mapped_device *md) -{ - struct dm_table *t; - sector_t dev_size = 1; - int r; - - /* - * Find current size of device. - * Default to 1 sector if inactive. - */ - t = dm_get_table(md); - if (t) { - dev_size = dm_table_get_size(t); - dm_table_put(t); - } - - r = dm_table_create(&t, FMODE_READ, 1, md); - if (r) - return r; - - r = dm_table_add_target(t, "error", 0, dev_size, NULL); - if (r) - goto out; - - r = dm_table_complete(t); - if (r) - goto out; - - *result = t; - -out: - if (r) - dm_table_put(t); - - return r; -} -EXPORT_SYMBOL_GPL(dm_create_error_table); - static void free_devices(struct list_head *devices) { struct list_head *tmp, *next; @@ -911,10 +873,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) q->max_hw_sectors = t->limits.max_hw_sectors; q->seg_boundary_mask = t->limits.seg_boundary_mask; q->bounce_pfn = t->limits.bounce_pfn; + if (t->limits.no_cluster) - q->queue_flags &= ~(1 << QUEUE_FLAG_CLUSTER); + queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); else - q->queue_flags |= (1 << QUEUE_FLAG_CLUSTER); + queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); } @@ -954,7 +917,7 @@ void dm_table_presuspend_targets(struct dm_table *t) if (!t) return; - return suspend_targets(t, 0); + suspend_targets(t, 0); } void dm_table_postsuspend_targets(struct dm_table *t) @@ -962,7 +925,7 @@ void dm_table_postsuspend_targets(struct dm_table *t) if (!t) return; - return suspend_targets(t, 1); + suspend_targets(t, 1); } int dm_table_resume_targets(struct dm_table *t) diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c index 50377e5dc2a..6f65883aef1 100644 --- a/drivers/md/dm-uevent.c +++ b/drivers/md/dm-uevent.c @@ -78,7 +78,7 @@ static struct dm_uevent *dm_build_path_uevent(struct mapped_device *md, event = dm_uevent_alloc(md); if (!event) { - DMERR("%s: dm_uevent_alloc() failed", __FUNCTION__); + DMERR("%s: dm_uevent_alloc() failed", __func__); goto err_nomem; } @@ -86,32 +86,32 @@ static struct dm_uevent *dm_build_path_uevent(struct mapped_device *md, if (add_uevent_var(&event->ku_env, "DM_TARGET=%s", ti->type->name)) { DMERR("%s: add_uevent_var() for DM_TARGET failed", - __FUNCTION__); + __func__); goto err_add; } if (add_uevent_var(&event->ku_env, "DM_ACTION=%s", dm_action)) { DMERR("%s: add_uevent_var() for DM_ACTION failed", - __FUNCTION__); + __func__); goto err_add; } if (add_uevent_var(&event->ku_env, "DM_SEQNUM=%u", dm_next_uevent_seq(md))) { DMERR("%s: add_uevent_var() for DM_SEQNUM failed", - __FUNCTION__); + __func__); goto err_add; } if (add_uevent_var(&event->ku_env, "DM_PATH=%s", path)) { - DMERR("%s: add_uevent_var() for DM_PATH failed", __FUNCTION__); + DMERR("%s: add_uevent_var() for DM_PATH failed", __func__); goto err_add; } if (add_uevent_var(&event->ku_env, "DM_NR_VALID_PATHS=%d", nr_valid_paths)) { DMERR("%s: add_uevent_var() for DM_NR_VALID_PATHS failed", - __FUNCTION__); + __func__); goto err_add; } @@ -146,25 +146,25 @@ void dm_send_uevents(struct list_head *events, struct kobject *kobj) if (dm_copy_name_and_uuid(event->md, event->name, event->uuid)) { DMERR("%s: dm_copy_name_and_uuid() failed", - __FUNCTION__); + __func__); goto uevent_free; } if (add_uevent_var(&event->ku_env, "DM_NAME=%s", event->name)) { DMERR("%s: add_uevent_var() for DM_NAME failed", - __FUNCTION__); + __func__); goto uevent_free; } if (add_uevent_var(&event->ku_env, "DM_UUID=%s", event->uuid)) { DMERR("%s: add_uevent_var() for DM_UUID failed", - __FUNCTION__); + __func__); goto uevent_free; } r = kobject_uevent_env(kobj, event->action, event->ku_env.envp); if (r) - DMERR("%s: kobject_uevent_env failed", __FUNCTION__); + DMERR("%s: kobject_uevent_env failed", __func__); uevent_free: dm_uevent_free(event); } @@ -187,7 +187,7 @@ void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, struct dm_uevent *event; if (event_type >= ARRAY_SIZE(_dm_uevent_type_names)) { - DMERR("%s: Invalid event_type %d", __FUNCTION__, event_type); + DMERR("%s: Invalid event_type %d", __func__, event_type); goto out; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 6617ce4af09..372369b1cc2 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -204,6 +204,7 @@ static int (*_inits[])(void) __initdata = { dm_target_init, dm_linear_init, dm_stripe_init, + dm_kcopyd_init, dm_interface_init, }; @@ -212,6 +213,7 @@ static void (*_exits[])(void) = { dm_target_exit, dm_linear_exit, dm_stripe_exit, + dm_kcopyd_exit, dm_interface_exit, }; @@ -922,7 +924,7 @@ static void free_minor(int minor) /* * See if the device with a specific minor # is free. */ -static int specific_minor(struct mapped_device *md, int minor) +static int specific_minor(int minor) { int r, m; @@ -955,7 +957,7 @@ out: return r; } -static int next_free_minor(struct mapped_device *md, int *minor) +static int next_free_minor(int *minor) { int r, m; @@ -966,9 +968,8 @@ static int next_free_minor(struct mapped_device *md, int *minor) spin_lock(&_minor_lock); r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); - if (r) { + if (r) goto out; - } if (m >= (1 << MINORBITS)) { idr_remove(&_minor_idr, m); @@ -991,7 +992,7 @@ static struct block_device_operations dm_blk_dops; static struct mapped_device *alloc_dev(int minor) { int r; - struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL); + struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); void *old_md; if (!md) { @@ -1004,13 +1005,12 @@ static struct mapped_device *alloc_dev(int minor) /* get a minor number for the dev */ if (minor == DM_ANY_MINOR) - r = next_free_minor(md, &minor); + r = next_free_minor(&minor); else - r = specific_minor(md, minor); + r = specific_minor(minor); if (r < 0) goto bad_minor; - memset(md, 0, sizeof(*md)); init_rwsem(&md->io_lock); mutex_init(&md->suspend_lock); spin_lock_init(&md->pushback_lock); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index b4584a39383..8c03b634e62 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -16,67 +16,6 @@ #include <linux/blkdev.h> #include <linux/hdreg.h> -#define DM_NAME "device-mapper" - -#define DMERR(f, arg...) \ - printk(KERN_ERR DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg) -#define DMERR_LIMIT(f, arg...) \ - do { \ - if (printk_ratelimit()) \ - printk(KERN_ERR DM_NAME ": " DM_MSG_PREFIX ": " \ - f "\n", ## arg); \ - } while (0) - -#define DMWARN(f, arg...) \ - printk(KERN_WARNING DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg) -#define DMWARN_LIMIT(f, arg...) \ - do { \ - if (printk_ratelimit()) \ - printk(KERN_WARNING DM_NAME ": " DM_MSG_PREFIX ": " \ - f "\n", ## arg); \ - } while (0) - -#define DMINFO(f, arg...) \ - printk(KERN_INFO DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg) -#define DMINFO_LIMIT(f, arg...) \ - do { \ - if (printk_ratelimit()) \ - printk(KERN_INFO DM_NAME ": " DM_MSG_PREFIX ": " f \ - "\n", ## arg); \ - } while (0) - -#ifdef CONFIG_DM_DEBUG -# define DMDEBUG(f, arg...) \ - printk(KERN_DEBUG DM_NAME ": " DM_MSG_PREFIX " DEBUG: " f "\n", ## arg) -# define DMDEBUG_LIMIT(f, arg...) \ - do { \ - if (printk_ratelimit()) \ - printk(KERN_DEBUG DM_NAME ": " DM_MSG_PREFIX ": " f \ - "\n", ## arg); \ - } while (0) -#else -# define DMDEBUG(f, arg...) do {} while (0) -# define DMDEBUG_LIMIT(f, arg...) do {} while (0) -#endif - -#define DMEMIT(x...) sz += ((sz >= maxlen) ? \ - 0 : scnprintf(result + sz, maxlen - sz, x)) - -#define SECTOR_SHIFT 9 - -/* - * Definitions of return values from target end_io function. - */ -#define DM_ENDIO_INCOMPLETE 1 -#define DM_ENDIO_REQUEUE 2 - -/* - * Definitions of return values from target map function. - */ -#define DM_MAPIO_SUBMITTED 0 -#define DM_MAPIO_REMAPPED 1 -#define DM_MAPIO_REQUEUE DM_ENDIO_REQUEUE - /* * Suspend feature flags */ @@ -136,34 +75,6 @@ static inline int array_too_big(unsigned long fixed, unsigned long obj, return (num > (ULONG_MAX - fixed) / obj); } -/* - * Ceiling(n / sz) - */ -#define dm_div_up(n, sz) (((n) + (sz) - 1) / (sz)) - -#define dm_sector_div_up(n, sz) ( \ -{ \ - sector_t _r = ((n) + (sz) - 1); \ - sector_div(_r, (sz)); \ - _r; \ -} \ -) - -/* - * ceiling(n / size) * size - */ -#define dm_round_up(n, sz) (dm_div_up((n), (sz)) * (sz)) - -static inline sector_t to_sector(unsigned long n) -{ - return (n >> 9); -} - -static inline unsigned long to_bytes(sector_t n) -{ - return (n << 9); -} - int dm_split_args(int *argc, char ***argvp, char *input); /* @@ -189,4 +100,13 @@ int dm_lock_for_deletion(struct mapped_device *md); void dm_kobject_uevent(struct mapped_device *md); +/* + * Dirty log + */ +int dm_dirty_log_init(void); +void dm_dirty_log_exit(void); + +int dm_kcopyd_init(void); +void dm_kcopyd_exit(void); + #endif diff --git a/drivers/md/kcopyd.h b/drivers/md/kcopyd.h deleted file mode 100644 index 4845f2a0c67..00000000000 --- a/drivers/md/kcopyd.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (C) 2001 Sistina Software - * - * This file is released under the GPL. - * - * Kcopyd provides a simple interface for copying an area of one - * block-device to one or more other block-devices, with an asynchronous - * completion notification. - */ - -#ifndef DM_KCOPYD_H -#define DM_KCOPYD_H - -#include "dm-io.h" - -/* FIXME: make this configurable */ -#define KCOPYD_MAX_REGIONS 8 - -#define KCOPYD_IGNORE_ERROR 1 - -/* - * To use kcopyd you must first create a kcopyd client object. - */ -struct kcopyd_client; -int kcopyd_client_create(unsigned int num_pages, struct kcopyd_client **result); -void kcopyd_client_destroy(struct kcopyd_client *kc); - -/* - * Submit a copy job to kcopyd. This is built on top of the - * previous three fns. - * - * read_err is a boolean, - * write_err is a bitset, with 1 bit for each destination region - */ -typedef void (*kcopyd_notify_fn)(int read_err, unsigned long write_err, - void *context); - -int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, - unsigned int num_dests, struct io_region *dests, - unsigned int flags, kcopyd_notify_fn fn, void *context); - -#endif diff --git a/drivers/md/md.c b/drivers/md/md.c index 5ebfb4d7990..83eb78b0013 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -276,13 +276,15 @@ static mddev_t * mddev_find(dev_t unit) init_waitqueue_head(&new->sb_wait); new->reshape_position = MaxSector; new->resync_max = MaxSector; + new->level = LEVEL_NONE; new->queue = blk_alloc_queue(GFP_KERNEL); if (!new->queue) { kfree(new); return NULL; } - set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags); + /* Can be unlocked because the queue is new: no concurrency */ + queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, new->queue); blk_queue_make_request(new->queue, md_fail_request); @@ -731,9 +733,9 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version else rdev->desc_nr = sb->this_disk.number; - if (refdev == 0) + if (!refdev) { ret = 1; - else { + } else { __u64 ev1, ev2; mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); if (!uuid_equal(refsb, sb)) { @@ -1116,9 +1118,9 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) else rdev->desc_nr = le32_to_cpu(sb->dev_number); - if (refdev == 0) + if (!refdev) { ret = 1; - else { + } else { __u64 ev1, ev2; struct mdp_superblock_1 *refsb = (struct mdp_superblock_1*)page_address(refdev->sb_page); @@ -1368,6 +1370,11 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) MD_BUG(); return -EINVAL; } + + /* prevent duplicates */ + if (find_rdev(mddev, rdev->bdev->bd_dev)) + return -EEXIST; + /* make sure rdev->size exceeds mddev->size */ if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { if (mddev->pers) { @@ -1651,6 +1658,8 @@ static void md_update_sb(mddev_t * mddev, int force_change) int sync_req; int nospares = 0; + if (mddev->external) + return; repeat: spin_lock_irq(&mddev->write_lock); @@ -1819,6 +1828,10 @@ state_show(mdk_rdev_t *rdev, char *page) len += sprintf(page+len, "%swrite_mostly",sep); sep = ","; } + if (test_bit(Blocked, &rdev->flags)) { + len += sprintf(page+len, "%sblocked", sep); + sep = ","; + } if (!test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags)) { len += sprintf(page+len, "%sspare", sep); @@ -1835,6 +1848,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) * remove - disconnects the device * writemostly - sets write_mostly * -writemostly - clears write_mostly + * blocked - sets the Blocked flag + * -blocked - clears the Blocked flag */ int err = -EINVAL; if (cmd_match(buf, "faulty") && rdev->mddev->pers) { @@ -1857,6 +1872,16 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) } else if (cmd_match(buf, "-writemostly")) { clear_bit(WriteMostly, &rdev->flags); err = 0; + } else if (cmd_match(buf, "blocked")) { + set_bit(Blocked, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "-blocked")) { + clear_bit(Blocked, &rdev->flags); + wake_up(&rdev->blocked_wait); + set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); + md_wakeup_thread(rdev->mddev->thread); + + err = 0; } return err ? err : len; } @@ -2096,7 +2121,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, rv = -EBUSY; else rv = entry->store(rdev, page, length); - mddev_unlock(rdev->mddev); + mddev_unlock(mddev); } return rv; } @@ -2185,7 +2210,9 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi goto abort_free; } } + INIT_LIST_HEAD(&rdev->same_set); + init_waitqueue_head(&rdev->blocked_wait); return rdev; @@ -2456,7 +2483,6 @@ resync_start_show(mddev_t *mddev, char *page) static ssize_t resync_start_store(mddev_t *mddev, const char *buf, size_t len) { - /* can only set chunk_size if array is not yet active */ char *e; unsigned long long n = simple_strtoull(buf, &e, 10); @@ -2590,15 +2616,20 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) err = do_md_stop(mddev, 1); else { mddev->ro = 1; + set_disk_ro(mddev->gendisk, 1); err = do_md_run(mddev); } break; case read_auto: - /* stopping an active array */ if (mddev->pers) { - err = do_md_stop(mddev, 1); - if (err == 0) - mddev->ro = 2; /* FIXME mark devices writable */ + if (mddev->ro != 1) + err = do_md_stop(mddev, 1); + else + err = restart_array(mddev); + if (err == 0) { + mddev->ro = 2; + set_disk_ro(mddev->gendisk, 0); + } } else { mddev->ro = 2; err = do_md_run(mddev); @@ -2611,6 +2642,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) if (atomic_read(&mddev->writes_pending) == 0) { if (mddev->in_sync == 0) { mddev->in_sync = 1; + if (mddev->safemode == 1) + mddev->safemode = 0; if (mddev->persistent) set_bit(MD_CHANGE_CLEAN, &mddev->flags); @@ -2634,6 +2667,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) err = 0; } else { mddev->ro = 0; + set_disk_ro(mddev->gendisk, 0); err = do_md_run(mddev); } break; @@ -3711,6 +3745,30 @@ static int do_md_stop(mddev_t * mddev, int mode) mddev->reshape_position = MaxSector; mddev->external = 0; mddev->persistent = 0; + mddev->level = LEVEL_NONE; + mddev->clevel[0] = 0; + mddev->flags = 0; + mddev->ro = 0; + mddev->metadata_type[0] = 0; + mddev->chunk_size = 0; + mddev->ctime = mddev->utime = 0; + mddev->layout = 0; + mddev->max_disks = 0; + mddev->events = 0; + mddev->delta_disks = 0; + mddev->new_level = LEVEL_NONE; + mddev->new_layout = 0; + mddev->new_chunk = 0; + mddev->curr_resync = 0; + mddev->resync_mismatches = 0; + mddev->suspend_lo = mddev->suspend_hi = 0; + mddev->sync_speed_min = mddev->sync_speed_max = 0; + mddev->recovery = 0; + mddev->in_sync = 0; + mddev->changed = 0; + mddev->degraded = 0; + mddev->barriers_work = 0; + mddev->safemode = 0; } else if (mddev->pers) printk(KERN_INFO "md: %s switched to read-only mode.\n", @@ -4918,6 +4976,9 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) if (!rdev || test_bit(Faulty, &rdev->flags)) return; + + if (mddev->external) + set_bit(Blocked, &rdev->flags); /* dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", mdname(mddev), @@ -5364,6 +5425,8 @@ void md_write_start(mddev_t *mddev, struct bio *bi) md_wakeup_thread(mddev->sync_thread); } atomic_inc(&mddev->writes_pending); + if (mddev->safemode == 1) + mddev->safemode = 0; if (mddev->in_sync) { spin_lock_irq(&mddev->write_lock); if (mddev->in_sync) { @@ -5718,7 +5781,7 @@ static int remove_and_add_spares(mddev_t *mddev) rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk >= 0 && - !mddev->external && + !test_bit(Blocked, &rdev->flags) && (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) && atomic_read(&rdev->nr_pending)==0) { @@ -5788,7 +5851,7 @@ void md_check_recovery(mddev_t *mddev) return; if (signal_pending(current)) { - if (mddev->pers->sync_request) { + if (mddev->pers->sync_request && !mddev->external) { printk(KERN_INFO "md: %s in immediate safe mode\n", mdname(mddev)); mddev->safemode = 2; @@ -5800,7 +5863,7 @@ void md_check_recovery(mddev_t *mddev) (mddev->flags && !mddev->external) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || - (mddev->safemode == 1) || + (mddev->external == 0 && mddev->safemode == 1) || (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) && !mddev->in_sync && mddev->recovery_cp == MaxSector) )) @@ -5809,16 +5872,20 @@ void md_check_recovery(mddev_t *mddev) if (mddev_trylock(mddev)) { int spares = 0; - spin_lock_irq(&mddev->write_lock); - if (mddev->safemode && !atomic_read(&mddev->writes_pending) && - !mddev->in_sync && mddev->recovery_cp == MaxSector) { - mddev->in_sync = 1; - if (mddev->persistent) - set_bit(MD_CHANGE_CLEAN, &mddev->flags); + if (!mddev->external) { + spin_lock_irq(&mddev->write_lock); + if (mddev->safemode && + !atomic_read(&mddev->writes_pending) && + !mddev->in_sync && + mddev->recovery_cp == MaxSector) { + mddev->in_sync = 1; + if (mddev->persistent) + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + } + if (mddev->safemode == 1) + mddev->safemode = 0; + spin_unlock_irq(&mddev->write_lock); } - if (mddev->safemode == 1) - mddev->safemode = 0; - spin_unlock_irq(&mddev->write_lock); if (mddev->flags) md_update_sb(mddev, 0); @@ -5913,6 +5980,16 @@ void md_check_recovery(mddev_t *mddev) } } +void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) +{ + sysfs_notify(&rdev->kobj, NULL, "state"); + wait_event_timeout(rdev->blocked_wait, + !test_bit(Blocked, &rdev->flags), + msecs_to_jiffies(5000)); + rdev_dec_pending(rdev, mddev); +} +EXPORT_SYMBOL(md_wait_for_blocked_rdev); + static int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { @@ -5947,13 +6024,9 @@ static struct notifier_block md_notifier = { static void md_geninit(void) { - struct proc_dir_entry *p; - dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); - p = create_proc_entry("mdstat", S_IRUGO, NULL); - if (p) - p->proc_fops = &md_seq_fops; + proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); } static int __init md_init(void) diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 3f299d835a2..42ee1a2dc14 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -244,7 +244,8 @@ static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) conf->working_disks--; mddev->degraded++; printk(KERN_ALERT "multipath: IO failure on %s," - " disabling IO path. \n Operation continuing" + " disabling IO path.\n" + "multipath: Operation continuing" " on %d IO paths.\n", bdevname (rdev->bdev,b), conf->working_disks); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ff61b309129..6778b7cb39b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -773,7 +773,6 @@ static int make_request(struct request_queue *q, struct bio * bio) r1bio_t *r1_bio; struct bio *read_bio; int i, targets = 0, disks; - mdk_rdev_t *rdev; struct bitmap *bitmap = mddev->bitmap; unsigned long flags; struct bio_list bl; @@ -781,6 +780,7 @@ static int make_request(struct request_queue *q, struct bio * bio) const int rw = bio_data_dir(bio); const int do_sync = bio_sync(bio); int do_barriers; + mdk_rdev_t *blocked_rdev; /* * Register the new request and wait if the reconstruction @@ -862,10 +862,17 @@ static int make_request(struct request_queue *q, struct bio * bio) first = 0; } #endif + retry_write: + blocked_rdev = NULL; rcu_read_lock(); for (i = 0; i < disks; i++) { - if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL && - !test_bit(Faulty, &rdev->flags)) { + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + atomic_inc(&rdev->nr_pending); + blocked_rdev = rdev; + break; + } + if (rdev && !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); if (test_bit(Faulty, &rdev->flags)) { rdev_dec_pending(rdev, mddev); @@ -878,6 +885,20 @@ static int make_request(struct request_queue *q, struct bio * bio) } rcu_read_unlock(); + if (unlikely(blocked_rdev)) { + /* Wait for this device to become unblocked */ + int j; + + for (j = 0; j < i; j++) + if (r1_bio->bios[j]) + rdev_dec_pending(conf->mirrors[j].rdev, mddev); + + allow_barrier(conf); + md_wait_for_blocked_rdev(blocked_rdev, mddev); + wait_barrier(conf); + goto retry_write; + } + BUG_ON(targets == 0); /* we never fail the last device */ if (targets < conf->raid_disks) { @@ -1008,8 +1029,8 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } else set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n" - " Operation continuing on %d devices\n", + printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n" + "raid1: Operation continuing on %d devices.\n", bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 32389d2f18f..5938fa96292 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -790,6 +790,7 @@ static int make_request(struct request_queue *q, struct bio * bio) const int do_sync = bio_sync(bio); struct bio_list bl; unsigned long flags; + mdk_rdev_t *blocked_rdev; if (unlikely(bio_barrier(bio))) { bio_endio(bio, -EOPNOTSUPP); @@ -879,17 +880,23 @@ static int make_request(struct request_queue *q, struct bio * bio) /* * WRITE: */ - /* first select target devices under spinlock and + /* first select target devices under rcu_lock and * inc refcount on their rdev. Record them by setting * bios[x] to bio */ raid10_find_phys(conf, r10_bio); + retry_write: + blocked_rdev = 0; rcu_read_lock(); for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - !test_bit(Faulty, &rdev->flags)) { + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + atomic_inc(&rdev->nr_pending); + blocked_rdev = rdev; + break; + } + if (rdev && !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); r10_bio->devs[i].bio = bio; } else { @@ -899,6 +906,22 @@ static int make_request(struct request_queue *q, struct bio * bio) } rcu_read_unlock(); + if (unlikely(blocked_rdev)) { + /* Have to wait for this device to get unblocked, then retry */ + int j; + int d; + + for (j = 0; j < i; j++) + if (r10_bio->devs[j].bio) { + d = r10_bio->devs[j].devnum; + rdev_dec_pending(conf->mirrors[d].rdev, mddev); + } + allow_barrier(conf); + md_wait_for_blocked_rdev(blocked_rdev, mddev); + wait_barrier(conf); + goto retry_write; + } + atomic_set(&r10_bio->remaining, 0); bio_list_init(&bl); @@ -1001,8 +1024,8 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n" - " Operation continuing on %d devices\n", + printk(KERN_ALERT "raid10: Disk failure on %s, disabling device.\n" + "raid10: Operation continuing on %d devices.\n", bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b162b839a66..087eee0cb80 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -63,6 +63,7 @@ #define STRIPE_SHIFT (PAGE_SHIFT - 9) #define STRIPE_SECTORS (STRIPE_SIZE>>9) #define IO_THRESHOLD 1 +#define BYPASS_THRESHOLD 1 #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) #define HASH_MASK (NR_HASH - 1) @@ -398,6 +399,7 @@ static void ops_run_io(struct stripe_head *sh) might_sleep(); + set_bit(STRIPE_IO_STARTED, &sh->state); for (i = disks; i--; ) { int rw; struct bio *bi; @@ -433,7 +435,7 @@ static void ops_run_io(struct stripe_head *sh) bi->bi_bdev = rdev->bdev; pr_debug("%s: for %llu schedule op %ld on disc %d\n", - __FUNCTION__, (unsigned long long)sh->sector, + __func__, (unsigned long long)sh->sector, bi->bi_rw, i); atomic_inc(&sh->count); bi->bi_sector = sh->sector + rdev->data_offset; @@ -520,7 +522,7 @@ static void ops_complete_biofill(void *stripe_head_ref) raid5_conf_t *conf = sh->raid_conf; int i; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); /* clear completed biofills */ @@ -569,7 +571,7 @@ static void ops_run_biofill(struct stripe_head *sh) raid5_conf_t *conf = sh->raid_conf; int i; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); for (i = sh->disks; i--; ) { @@ -600,7 +602,7 @@ static void ops_complete_compute5(void *stripe_head_ref) int target = sh->ops.target; struct r5dev *tgt = &sh->dev[target]; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); set_bit(R5_UPTODATE, &tgt->flags); @@ -625,7 +627,7 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending) int i; pr_debug("%s: stripe %llu block: %d\n", - __FUNCTION__, (unsigned long long)sh->sector, target); + __func__, (unsigned long long)sh->sector, target); BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); for (i = disks; i--; ) @@ -653,7 +655,7 @@ static void ops_complete_prexor(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); set_bit(STRIPE_OP_PREXOR, &sh->ops.complete); @@ -670,7 +672,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) /* existing parity data subtracted */ struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); for (i = disks; i--; ) { @@ -699,7 +701,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, */ int prexor = test_bit(STRIPE_OP_PREXOR, &pending); - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); for (i = disks; i--; ) { @@ -744,7 +746,7 @@ static void ops_complete_postxor(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); @@ -757,7 +759,7 @@ static void ops_complete_write(void *stripe_head_ref) struct stripe_head *sh = stripe_head_ref; int disks = sh->disks, i, pd_idx = sh->pd_idx; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); for (i = disks; i--; ) { @@ -787,7 +789,7 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, unsigned long flags; dma_async_tx_callback callback; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); /* check if prexor is active which means only process blocks @@ -837,7 +839,7 @@ static void ops_complete_check(void *stripe_head_ref) struct stripe_head *sh = stripe_head_ref; int pd_idx = sh->pd_idx; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) && @@ -859,7 +861,7 @@ static void ops_run_check(struct stripe_head *sh) int count = 0, pd_idx = sh->pd_idx, i; struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); for (i = disks; i--; ) { @@ -1260,8 +1262,8 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } set_bit(Faulty, &rdev->flags); printk (KERN_ALERT - "raid5: Disk failure on %s, disabling device." - " Operation continuing on %d devices\n", + "raid5: Disk failure on %s, disabling device.\n" + "raid5: Operation continuing on %d devices.\n", bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); } } @@ -1720,6 +1722,9 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) locked++; } } + if (locked + 1 == disks) + if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) + atomic_inc(&sh->raid_conf->pending_full_writes); } else { BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); @@ -1759,7 +1764,7 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) locked++; pr_debug("%s: stripe %llu locked: %d pending: %lx\n", - __FUNCTION__, (unsigned long long)sh->sector, + __func__, (unsigned long long)sh->sector, locked, sh->ops.pending); return locked; @@ -1947,6 +1952,9 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, STRIPE_SECTORS, 0, 0); } + if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) + if (atomic_dec_and_test(&conf->pending_full_writes)) + md_wakeup_thread(conf->mddev->thread); } /* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks @@ -2149,6 +2157,10 @@ static void handle_completed_write_requests(raid5_conf_t *conf, 0); } } + + if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) + if (atomic_dec_and_test(&conf->pending_full_writes)) + md_wakeup_thread(conf->mddev->thread); } static void handle_issuing_new_write_requests5(raid5_conf_t *conf, @@ -2333,6 +2345,9 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf, s->locked++; set_bit(R5_Wantwrite, &sh->dev[i].flags); } + if (s->locked == disks) + if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) + atomic_inc(&conf->pending_full_writes); /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ set_bit(STRIPE_INSYNC, &sh->state); @@ -2592,6 +2607,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, } } + /* * handle_stripe - do things to a stripe. * @@ -2617,6 +2633,7 @@ static void handle_stripe5(struct stripe_head *sh) struct stripe_head_state s; struct r5dev *dev; unsigned long pending = 0; + mdk_rdev_t *blocked_rdev = NULL; memset(&s, 0, sizeof(s)); pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " @@ -2676,6 +2693,11 @@ static void handle_stripe5(struct stripe_head *sh) if (dev->written) s.written++; rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + blocked_rdev = rdev; + atomic_inc(&rdev->nr_pending); + break; + } if (!rdev || !test_bit(In_sync, &rdev->flags)) { /* The ReadError flag will just be confusing now */ clear_bit(R5_ReadError, &dev->flags); @@ -2690,6 +2712,11 @@ static void handle_stripe5(struct stripe_head *sh) } rcu_read_unlock(); + if (unlikely(blocked_rdev)) { + set_bit(STRIPE_HANDLE, &sh->state); + goto unlock; + } + if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) sh->ops.count++; @@ -2879,8 +2906,13 @@ static void handle_stripe5(struct stripe_head *sh) if (sh->ops.count) pending = get_stripe_work(sh); + unlock: spin_unlock(&sh->lock); + /* wait for this device to become unblocked */ + if (unlikely(blocked_rdev)) + md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); + if (pending) raid5_run_ops(sh, pending); @@ -2897,6 +2929,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) struct stripe_head_state s; struct r6_state r6s; struct r5dev *dev, *pdev, *qdev; + mdk_rdev_t *blocked_rdev = NULL; r6s.qd_idx = raid6_next_disk(pd_idx, disks); pr_debug("handling stripe %llu, state=%#lx cnt=%d, " @@ -2960,6 +2993,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) if (dev->written) s.written++; rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + blocked_rdev = rdev; + atomic_inc(&rdev->nr_pending); + break; + } if (!rdev || !test_bit(In_sync, &rdev->flags)) { /* The ReadError flag will just be confusing now */ clear_bit(R5_ReadError, &dev->flags); @@ -2974,6 +3012,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) set_bit(R5_Insync, &dev->flags); } rcu_read_unlock(); + + if (unlikely(blocked_rdev)) { + set_bit(STRIPE_HANDLE, &sh->state); + goto unlock; + } pr_debug("locked=%d uptodate=%d to_read=%d" " to_write=%d failed=%d failed_num=%d,%d\n", s.locked, s.uptodate, s.to_read, s.to_write, s.failed, @@ -3079,8 +3122,13 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) handle_stripe_expansion(conf, sh, &r6s); + unlock: spin_unlock(&sh->lock); + /* wait for this device to become unblocked */ + if (unlikely(blocked_rdev)) + md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); + return_io(return_bi); for (i=disks; i-- ;) { @@ -3094,6 +3142,8 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) else continue; + set_bit(STRIPE_IO_STARTED, &sh->state); + bi = &sh->dev[i].req; bi->bi_rw = rw; @@ -3164,7 +3214,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf) clear_bit(STRIPE_DELAYED, &sh->state); if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); - list_add_tail(&sh->lru, &conf->handle_list); + list_add_tail(&sh->lru, &conf->hold_list); } } else blk_plug_device(conf->mddev->queue); @@ -3442,6 +3492,58 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) } } +/* __get_priority_stripe - get the next stripe to process + * + * Full stripe writes are allowed to pass preread active stripes up until + * the bypass_threshold is exceeded. In general the bypass_count + * increments when the handle_list is handled before the hold_list; however, it + * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a + * stripe with in flight i/o. The bypass_count will be reset when the + * head of the hold_list has changed, i.e. the head was promoted to the + * handle_list. + */ +static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf) +{ + struct stripe_head *sh; + + pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", + __func__, + list_empty(&conf->handle_list) ? "empty" : "busy", + list_empty(&conf->hold_list) ? "empty" : "busy", + atomic_read(&conf->pending_full_writes), conf->bypass_count); + + if (!list_empty(&conf->handle_list)) { + sh = list_entry(conf->handle_list.next, typeof(*sh), lru); + + if (list_empty(&conf->hold_list)) + conf->bypass_count = 0; + else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { + if (conf->hold_list.next == conf->last_hold) + conf->bypass_count++; + else { + conf->last_hold = conf->hold_list.next; + conf->bypass_count -= conf->bypass_threshold; + if (conf->bypass_count < 0) + conf->bypass_count = 0; + } + } + } else if (!list_empty(&conf->hold_list) && + ((conf->bypass_threshold && + conf->bypass_count > conf->bypass_threshold) || + atomic_read(&conf->pending_full_writes) == 0)) { + sh = list_entry(conf->hold_list.next, + typeof(*sh), lru); + conf->bypass_count -= conf->bypass_threshold; + if (conf->bypass_count < 0) + conf->bypass_count = 0; + } else + return NULL; + + list_del_init(&sh->lru); + atomic_inc(&sh->count); + BUG_ON(atomic_read(&sh->count) != 1); + return sh; +} static int make_request(struct request_queue *q, struct bio * bi) { @@ -3914,7 +4016,6 @@ static void raid5d(mddev_t *mddev) handled = 0; spin_lock_irq(&conf->device_lock); while (1) { - struct list_head *first; struct bio *bio; if (conf->seq_flush != conf->seq_write) { @@ -3936,17 +4037,12 @@ static void raid5d(mddev_t *mddev) handled++; } - if (list_empty(&conf->handle_list)) { + sh = __get_priority_stripe(conf); + + if (!sh) { async_tx_issue_pending_all(); break; } - - first = conf->handle_list.next; - sh = list_entry(first, struct stripe_head, lru); - - list_del_init(first); - atomic_inc(&sh->count); - BUG_ON(atomic_read(&sh->count)!= 1); spin_unlock_irq(&conf->device_lock); handled++; @@ -3978,15 +4074,13 @@ static ssize_t raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) { raid5_conf_t *conf = mddev_to_conf(mddev); - char *end; - int new; + unsigned long new; if (len >= PAGE_SIZE) return -EINVAL; if (!conf) return -ENODEV; - new = simple_strtoul(page, &end, 10); - if (!*page || (*end && *end != '\n') ) + if (strict_strtoul(page, 10, &new)) return -EINVAL; if (new <= 16 || new > 32768) return -EINVAL; @@ -4011,6 +4105,40 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, raid5_store_stripe_cache_size); static ssize_t +raid5_show_preread_threshold(mddev_t *mddev, char *page) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + if (conf) + return sprintf(page, "%d\n", conf->bypass_threshold); + else + return 0; +} + +static ssize_t +raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + unsigned long new; + if (len >= PAGE_SIZE) + return -EINVAL; + if (!conf) + return -ENODEV; + + if (strict_strtoul(page, 10, &new)) + return -EINVAL; + if (new > conf->max_nr_stripes) + return -EINVAL; + conf->bypass_threshold = new; + return len; +} + +static struct md_sysfs_entry +raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, + S_IRUGO | S_IWUSR, + raid5_show_preread_threshold, + raid5_store_preread_threshold); + +static ssize_t stripe_cache_active_show(mddev_t *mddev, char *page) { raid5_conf_t *conf = mddev_to_conf(mddev); @@ -4026,6 +4154,7 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active); static struct attribute *raid5_attrs[] = { &raid5_stripecache_size.attr, &raid5_stripecache_active.attr, + &raid5_preread_bypass_threshold.attr, NULL, }; static struct attribute_group raid5_attrs_group = { @@ -4130,12 +4259,14 @@ static int run(mddev_t *mddev) init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); + INIT_LIST_HEAD(&conf->hold_list); INIT_LIST_HEAD(&conf->delayed_list); INIT_LIST_HEAD(&conf->bitmap_list); INIT_LIST_HEAD(&conf->inactive_list); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); atomic_set(&conf->active_aligned_reads, 0); + conf->bypass_threshold = BYPASS_THRESHOLD; pr_debug("raid5: run(%s) called.\n", mdname(mddev)); diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c index 77a6e4bf503..21987e3dbe6 100644 --- a/drivers/md/raid6algos.c +++ b/drivers/md/raid6algos.c @@ -121,7 +121,8 @@ int __init raid6_select_algo(void) j0 = jiffies; while ( (j1 = jiffies) == j0 ) cpu_relax(); - while ( (jiffies-j1) < (1 << RAID6_TIME_JIFFIES_LG2) ) { + while (time_before(jiffies, + j1 + (1<<RAID6_TIME_JIFFIES_LG2))) { (*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs); perf++; } |