7 files changed, 223 insertions, 90 deletions
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index f01096549a9..823ceba6efa 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1047,6 +1047,19 @@ static int populate_table(struct dm_table *table,
 	return dm_table_complete(table);
 }
 
+static int table_prealloc_integrity(struct dm_table *t,
+				    struct mapped_device *md)
+{
+	struct list_head *devices = dm_table_get_devices(t);
+	struct dm_dev_internal *dd;
+
+	list_for_each_entry(dd, devices, list)
+		if (bdev_get_integrity(dd->dm_dev.bdev))
+			return blk_integrity_register(dm_disk(md), NULL);
+
+	return 0;
+}
+
 static int table_load(struct dm_ioctl *param, size_t param_size)
 {
 	int r;
@@ -1068,6 +1081,14 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 		goto out;
 	}
 
+	r = table_prealloc_integrity(t, md);
+	if (r) {
+		DMERR("%s: could not register integrity profile.",
+		      dm_device_name(md));
+		dm_table_destroy(t);
+		goto out;
+	}
+
 	down_write(&_hash_lock);
 	hc = dm_get_mdptr(md);
 	if (!hc || hc->md != md) {
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 0a225da2127..3e3fc06cb86 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -297,7 +297,8 @@ static int run_complete_job(struct kcopyd_job *job)
 	dm_kcopyd_notify_fn fn = job->fn;
 	struct dm_kcopyd_client *kc = job->kc;
 
-	kcopyd_put_pages(kc, job->pages);
+	if (job->pages)
+		kcopyd_put_pages(kc, job->pages);
 	mempool_free(job, kc->job_pool);
 	fn(read_err, write_err, context);
 
@@ -461,6 +462,7 @@ static void segment_complete(int read_err, unsigned long write_err,
 	sector_t progress = 0;
 	sector_t count = 0;
 	struct kcopyd_job *job = (struct kcopyd_job *) context;
+	struct dm_kcopyd_client *kc = job->kc;
 
 	mutex_lock(&job->lock);
 
@@ -490,7 +492,7 @@ static void segment_complete(int read_err, unsigned long write_err,
 
 	if (count) {
 		int i;
-		struct kcopyd_job *sub_job = mempool_alloc(job->kc->job_pool,
+		struct kcopyd_job *sub_job = mempool_alloc(kc->job_pool,
 							   GFP_NOIO);
 
 		*sub_job = *job;
@@ -509,13 +511,16 @@ static void segment_complete(int read_err, unsigned long write_err,
 	} else if (atomic_dec_and_test(&job->sub_jobs)) {
 
 		/*
-		 * To avoid a race we must keep the job around
-		 * until after the notify function has completed.
-		 * Otherwise the client may try and stop the job
-		 * after we've completed.
+		 * Queue the completion callback to the kcopyd thread.
+		 *
+		 * Some callers assume that all the completions are called
+		 * from a single thread and don't race with each other.
+		 *
+		 * We must not call the callback directly here because this
+		 * code may not be executing in the thread.
 		 */
-		job->fn(read_err, write_err, job->context);
-		mempool_free(job, job->kc->job_pool);
+		push(&kc->complete_jobs, job);
+		wake(kc);
 	}
 }
 
@@ -528,6 +533,8 @@ static void split_job(struct kcopyd_job *job)
 {
 	int i;
 
+	atomic_inc(&job->kc->nr_jobs);
+
 	atomic_set(&job->sub_jobs, SPLIT_COUNT);
 	for (i = 0; i < SPLIT_COUNT; i++)
 		segment_complete(0, 0u, job);
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index bfa107f59d9..79fb53e51c7 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -142,7 +142,6 @@ static struct target_type linear_target = {
 	.status = linear_status,
 	.ioctl  = linear_ioctl,
 	.merge  = linear_merge,
-	.features = DM_TARGET_SUPPORTS_BARRIERS,
 };
 
 int __init dm_linear_init(void)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index e8361b191b9..429b50b975d 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -52,8 +52,6 @@ struct dm_table {
 	sector_t *highs;
 	struct dm_target *targets;
 
-	unsigned barriers_supported:1;
-
 	/*
 	 * Indicates the rw permissions for the new logical
 	 * device.  This should be a combination of FMODE_READ
@@ -243,7 +241,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
 
 	INIT_LIST_HEAD(&t->devices);
 	atomic_set(&t->holders, 0);
-	t->barriers_supported = 1;
 
 	if (!num_targets)
 		num_targets = KEYS_PER_NODE;
@@ -751,10 +748,6 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 	/* FIXME: the plan is to combine high here and then have
 	 * the merge fn apply the target level restrictions. */
 	combine_restrictions_low(&t->limits, &tgt->limits);
-
-	if (!(tgt->type->features & DM_TARGET_SUPPORTS_BARRIERS))
-		t->barriers_supported = 0;
-
 	return 0;
 
  bad:
@@ -799,12 +792,6 @@ int dm_table_complete(struct dm_table *t)
 
 	check_for_valid_limits(&t->limits);
 
-	/*
-	 * We only support barriers if there is exactly one underlying device.
-	 */
-	if (!list_is_singular(&t->devices))
-		t->barriers_supported = 0;
-
 	/* how many indexes will the btree have ? */
 	leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
 	t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
@@ -879,6 +866,45 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
 	return &t->targets[(KEYS_PER_NODE * n) + k];
 }
 
+/*
+ * Set the integrity profile for this device if all devices used have
+ * matching profiles.
+ */
+static void dm_table_set_integrity(struct dm_table *t)
+{
+	struct list_head *devices = dm_table_get_devices(t);
+	struct dm_dev_internal *prev = NULL, *dd = NULL;
+
+	if (!blk_get_integrity(dm_disk(t->md)))
+		return;
+
+	list_for_each_entry(dd, devices, list) {
+		if (prev &&
+		    blk_integrity_compare(prev->dm_dev.bdev->bd_disk,
+					  dd->dm_dev.bdev->bd_disk) < 0) {
+			DMWARN("%s: integrity not set: %s and %s mismatch",
+			       dm_device_name(t->md),
+			       prev->dm_dev.bdev->bd_disk->disk_name,
+			       dd->dm_dev.bdev->bd_disk->disk_name);
+			goto no_integrity;
+		}
+		prev = dd;
+	}
+
+	if (!prev || !bdev_get_integrity(prev->dm_dev.bdev))
+		goto no_integrity;
+
+	blk_integrity_register(dm_disk(t->md),
+			       bdev_get_integrity(prev->dm_dev.bdev));
+
+	return;
+
+no_integrity:
+	blk_integrity_register(dm_disk(t->md), NULL);
+
+	return;
+}
+
 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
 {
 	/*
@@ -899,6 +925,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
 	else
 		queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q);
 
+	dm_table_set_integrity(t);
 }
 
 unsigned int dm_table_get_num_targets(struct dm_table *t)
@@ -1019,12 +1046,6 @@ struct mapped_device *dm_table_get_md(struct dm_table *t)
 	return t->md;
 }
 
-int dm_table_barrier_ok(struct dm_table *t)
-{
-	return t->barriers_supported;
-}
-EXPORT_SYMBOL(dm_table_barrier_ok);
-
 EXPORT_SYMBOL(dm_vcalloc);
 EXPORT_SYMBOL(dm_get_device);
 EXPORT_SYMBOL(dm_put_device);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 788ba96a625..8a994be035b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -89,12 +89,13 @@ union map_info *dm_get_mapinfo(struct bio *bio)
 /*
  * Bits for the md->flags field.
  */
-#define DMF_BLOCK_IO 0
+#define DMF_BLOCK_IO_FOR_SUSPEND 0
 #define DMF_SUSPENDED 1
 #define DMF_FROZEN 2
 #define DMF_FREEING 3
 #define DMF_DELETING 4
 #define DMF_NOFLUSH_SUSPENDING 5
+#define DMF_QUEUE_IO_TO_THREAD 6
 
 /*
  * Work processed by per-device workqueue.
@@ -124,6 +125,11 @@ struct mapped_device {
 	spinlock_t deferred_lock;
 
 	/*
+	 * An error from the barrier request currently being processed.
+	 */
+	int barrier_error;
+
+	/*
 	 * Processing queue (flush/barriers)
 	 */
 	struct workqueue_struct *wq;
@@ -424,6 +430,10 @@ static void end_io_acct(struct dm_io *io)
 	part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
 	part_stat_unlock();
 
+	/*
+	 * After this is decremented the bio must not be touched if it is
+	 * a barrier.
+	 */
 	dm_disk(md)->part0.in_flight = pending =
 		atomic_dec_return(&md->pending);
 
@@ -435,21 +445,18 @@ static void end_io_acct(struct dm_io *io)
 /*
  * Add the bio to the list of deferred io.
  */
-static int queue_io(struct mapped_device *md, struct bio *bio)
+static void queue_io(struct mapped_device *md, struct bio *bio)
 {
 	down_write(&md->io_lock);
 
-	if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
-		up_write(&md->io_lock);
-		return 1;
-	}
-
 	spin_lock_irq(&md->deferred_lock);
 	bio_list_add(&md->deferred, bio);
 	spin_unlock_irq(&md->deferred_lock);
 
+	if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
+		queue_work(md->wq, &md->work);
+
 	up_write(&md->io_lock);
-	return 0;		/* deferred successfully */
 }
 
 /*
@@ -533,25 +540,35 @@ static void dec_pending(struct dm_io *io, int error)
 			 */
 			spin_lock_irqsave(&md->deferred_lock, flags);
 			if (__noflush_suspending(md))
-				bio_list_add(&md->deferred, io->bio);
+				bio_list_add_head(&md->deferred, io->bio);
 			else
 				/* noflush suspend was interrupted. */
 				io->error = -EIO;
 			spin_unlock_irqrestore(&md->deferred_lock, flags);
 		}
 
-		end_io_acct(io);
-
 		io_error = io->error;
 		bio = io->bio;
 
-		free_io(md, io);
+		if (bio_barrier(bio)) {
+			/*
+			 * There can be just one barrier request so we use
+			 * a per-device variable for error reporting.
+			 * Note that you can't touch the bio after end_io_acct
+			 */
+			md->barrier_error = io_error;
+			end_io_acct(io);
+		} else {
+			end_io_acct(io);
 
-		if (io_error != DM_ENDIO_REQUEUE) {
-			trace_block_bio_complete(md->queue, bio);
+			if (io_error != DM_ENDIO_REQUEUE) {
+				trace_block_bio_complete(md->queue, bio);
 
-			bio_endio(bio, io_error);
+				bio_endio(bio, io_error);
+			}
 		}
+
+		free_io(md, io);
 	}
 }
 
@@ -693,13 +710,19 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
 
 	clone->bi_sector = sector;
 	clone->bi_bdev = bio->bi_bdev;
-	clone->bi_rw = bio->bi_rw;
+	clone->bi_rw = bio->bi_rw & ~(1 << BIO_RW_BARRIER);
 	clone->bi_vcnt = 1;
 	clone->bi_size = to_bytes(len);
 	clone->bi_io_vec->bv_offset = offset;
 	clone->bi_io_vec->bv_len = clone->bi_size;
 	clone->bi_flags |= 1 << BIO_CLONED;
 
+	if (bio_integrity(bio)) {
+		bio_integrity_clone(clone, bio, GFP_NOIO);
+		bio_integrity_trim(clone,
+				   bio_sector_offset(bio, idx, offset), len);
+	}
+
 	return clone;
 }
 
@@ -714,6 +737,7 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
 
 	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
 	__bio_clone(clone, bio);
+	clone->bi_rw &= ~(1 << BIO_RW_BARRIER);
 	clone->bi_destructor = dm_bio_destructor;
 	clone->bi_sector = sector;
 	clone->bi_idx = idx;
@@ -721,6 +745,14 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
 	clone->bi_size = to_bytes(len);
 	clone->bi_flags &= ~(1 << BIO_SEG_VALID);
 
+	if (bio_integrity(bio)) {
+		bio_integrity_clone(clone, bio, GFP_NOIO);
+
+		if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
+			bio_integrity_trim(clone,
+					   bio_sector_offset(bio, idx, 0), len);
+	}
+
 	return clone;
 }
 
@@ -834,14 +866,13 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 
 	ci.map = dm_get_table(md);
 	if (unlikely(!ci.map)) {
-		bio_io_error(bio);
-		return;
-	}
-	if (unlikely(bio_barrier(bio) && !dm_table_barrier_ok(ci.map))) {
-		dm_table_put(ci.map);
-		bio_endio(bio, -EOPNOTSUPP);
+		if (!bio_barrier(bio))
+			bio_io_error(bio);
+		else
+			md->barrier_error = -EIO;
 		return;
 	}
+
 	ci.md = md;
 	ci.bio = bio;
 	ci.io = alloc_io(md);
@@ -918,7 +949,6 @@ out:
  */
 static int dm_request(struct request_queue *q, struct bio *bio)
 {
-	int r = -EIO;
 	int rw = bio_data_dir(bio);
 	struct mapped_device *md = q->queuedata;
 	int cpu;
@@ -931,34 +961,27 @@ static int dm_request(struct request_queue *q, struct bio *bio)
 	part_stat_unlock();
 
 	/*
-	 * If we're suspended we have to queue
-	 * this io for later.
+	 * If we're suspended or the thread is processing barriers
+	 * we have to queue this io for later.
 	 */
-	while (test_bit(DMF_BLOCK_IO, &md->flags)) {
+	if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
+	    unlikely(bio_barrier(bio))) {
 		up_read(&md->io_lock);
 
-		if (bio_rw(bio) != READA)
-			r = queue_io(md, bio);
+		if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
+		    bio_rw(bio) == READA) {
+			bio_io_error(bio);
+			return 0;
+		}
 
-		if (r <= 0)
-			goto out_req;
+		queue_io(md, bio);
 
-		/*
-		 * We're in a while loop, because someone could suspend
-		 * before we get to the following read lock.
-		 */
-		down_read(&md->io_lock);
+		return 0;
 	}
 
 	__split_and_process_bio(md, bio);
 	up_read(&md->io_lock);
 	return 0;
-
-out_req:
-	if (r < 0)
-		bio_io_error(bio);
-
-	return 0;
 }
 
 static void dm_unplug_all(struct request_queue *q)
@@ -978,7 +1001,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 	struct mapped_device *md = congested_data;
 	struct dm_table *map;
 
-	if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
+	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
 		map = dm_get_table(md);
 		if (map) {
 			r = dm_table_any_congested(map, bdi_bits);
@@ -1193,6 +1216,7 @@ static void free_dev(struct mapped_device *md)
 	mempool_destroy(md->tio_pool);
 	mempool_destroy(md->io_pool);
 	bioset_free(md->bs);
+	blk_integrity_unregister(md->disk);
 	del_gendisk(md->disk);
 	free_minor(minor);
 
@@ -1406,6 +1430,36 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 	return r;
 }
 
+static int dm_flush(struct mapped_device *md)
+{
+	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
+	return 0;
+}
+
+static void process_barrier(struct mapped_device *md, struct bio *bio)
+{
+	int error = dm_flush(md);
+
+	if (unlikely(error)) {
+		bio_endio(bio, error);
+		return;
+	}
+	if (bio_empty_barrier(bio)) {
+		bio_endio(bio, 0);
+		return;
+	}
+
+	__split_and_process_bio(md, bio);
+
+	error = dm_flush(md);
+
+	if (!error && md->barrier_error)
+		error = md->barrier_error;
+
+	if (md->barrier_error != DM_ENDIO_REQUEUE)
+		bio_endio(bio, error);
+}
+
 /*
  * Process the deferred bios
  */
@@ -1417,25 +1471,34 @@ static void dm_wq_work(struct work_struct *work)
 
 	down_write(&md->io_lock);
 
-next_bio:
-	spin_lock_irq(&md->deferred_lock);
-	c = bio_list_pop(&md->deferred);
-	spin_unlock_irq(&md->deferred_lock);
+	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
+		spin_lock_irq(&md->deferred_lock);
+		c = bio_list_pop(&md->deferred);
+		spin_unlock_irq(&md->deferred_lock);
 
-	if (c) {
-		__split_and_process_bio(md, c);
-		goto next_bio;
-	}
+		if (!c) {
+			clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
+			break;
+		}
 
-	clear_bit(DMF_BLOCK_IO, &md->flags);
+		up_write(&md->io_lock);
+
+		if (bio_barrier(c))
+			process_barrier(md, c);
+		else
+			__split_and_process_bio(md, c);
+
+		down_write(&md->io_lock);
+	}
 
 	up_write(&md->io_lock);
 }
 
 static void dm_queue_flush(struct mapped_device *md)
 {
+	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
+	smp_mb__after_clear_bit();
 	queue_work(md->wq, &md->work);
-	flush_workqueue(md->wq);
 }
 
 /*
@@ -1553,20 +1616,36 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	}
 
 	/*
-	 * First we set the BLOCK_IO flag so no more ios will be mapped.
+	 * Here we must make sure that no processes are submitting requests
+	 * to target drivers i.e. no one may be executing
+	 * __split_and_process_bio. This is called from dm_request and
+	 * dm_wq_work.
+	 *
+	 * To get all processes out of __split_and_process_bio in dm_request,
+	 * we take the write lock. To prevent any process from reentering
+	 * __split_and_process_bio from dm_request, we set
+	 * DMF_QUEUE_IO_TO_THREAD.
+	 *
+	 * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
+	 * and call flush_workqueue(md->wq). flush_workqueue will wait until
+	 * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
+	 * further calls to __split_and_process_bio from dm_wq_work.
 	 */
 	down_write(&md->io_lock);
-	set_bit(DMF_BLOCK_IO, &md->flags);
-
+	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
+	set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
 	up_write(&md->io_lock);
 
+	flush_workqueue(md->wq);
+
 	/*
-	 * Wait for the already-mapped ios to complete.
+	 * At this point no more requests are entering target request routines.
+	 * We call dm_wait_for_completion to wait for all existing requests
+	 * to finish.
 	 */
 	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
 
 	down_write(&md->io_lock);
-
 	if (noflush)
 		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 	up_write(&md->io_lock);
@@ -1579,6 +1658,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 		goto out; /* pushback list is already flushed, so skip flush */
 	}
 
+	/*
+	 * If dm_wait_for_completion returned 0, the device is completely
+	 * quiescent now. There is no request-processing activity. All new
+	 * requests are being added to md->deferred list.
+	 */
+
 	dm_table_postsuspend_targets(map);
 
 	set_bit(DMF_SUSPENDED, &md->flags);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index b48397c0abb..a31506d93e9 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -52,7 +52,6 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits);
  * To check the return value from dm_table_find_target().
  */
 #define dm_target_is_valid(t) ((t)->table)
-int dm_table_barrier_ok(struct dm_table *t);
 
 /*-----------------------------------------------------------------
  * A registry of target types.
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b4f4badc006..274b491a11c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -123,6 +123,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 				goto out_free_pages;
 
 			bio->bi_io_vec[i].bv_page = page;
+			bio->bi_vcnt = i+1;
 		}
 	}
 	/* If not user-requests, copy the page pointers to all bios */
@@ -138,9 +139,9 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 	return r1_bio;
 
 out_free_pages:
-	for (i=0; i < RESYNC_PAGES ; i++)
-		for (j=0 ; j < pi->raid_disks; j++)
-			safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
+	for (j=0 ; j < pi->raid_disks; j++)
+		for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++)
+			put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
 	j = -1;
 out_free_bio:
 	while ( ++j < pi->raid_disks )
@@ -585,7 +586,7 @@ static int raid1_congested(void *data, int bits)
 			/* Note the '|| 1' - when read_balance prefers
 			 * non-congested targets, it can be removed
 			 */
-			if ((bits & (1<<BDI_write_congested)) || 1)
+			if ((bits & (1<<BDI_async_congested)) || 1)
 				ret |= bdi_congested(&q->backing_dev_info, bits);
 			else
 				ret &= bdi_congested(&q->backing_dev_info, bits);