From a77e28c7e1dc1a6a035c7627d4a88ecf3ea09aea Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Fri, 4 Sep 2009 20:40:16 +0100
Subject: dm multipath: fix oops when request based io fails when no paths

The patch posted at http://marc.info/?l=dm-devel&m=124539787228784&w=2
which was merged into cec47e3d4a861e1d942b3a580d0bbef2700d2bb2 ("dm:
prepare for request based option") introduced a regression in
request-based dm.

If map_request() calls dm_kill_unmapped_request() to complete a cloned
bio without dispatching it, clone->bio is still set when
dm_end_request() is called and the BUG_ON(clone->bio) is incorrect.

The patch fixes this bug by freeing bio in dm_end_request() if the clone
has bio.  I've redone my tests to cover all I/O paths and confirmed
there's no other regression.

Here is the oops I hit in request-based dm when I do I/O to a multipath
device which doesn't have any active path nor queue_if_no_path setting:

------------[ cut here ]------------
kernel BUG at /root/2.6.31-rc4.rqdm/drivers/md/dm.c:828!
invalid opcode: 0000 [#1] SMP
last sysfs file: /sys/devices/system/cpu/cpu3/cache/index2/shared_cpu_map
CPU 1
Modules linked in: autofs4 sunrpc cpufreq_ondemand acpi_cpufreq dm_mirror dm_region_hash dm_log dm_service_time dm_multipath scsi_dh dm_mod video output sbs sbshc battery ac sg sr_mod e1000e button cdrom serio_raw rtc_cmos rtc_core rtc_lib piix lpfc scsi_transport_fc ata_piix libata megaraid_sas sd_mod scsi_mod crc_t10dif ext3 jbd uhci_hcd ohci_hcd ehci_hcd [last unloaded: microcode]
Pid: 7, comm: ksoftirqd/1 Not tainted 2.6.31-rc4.rqdm #1 Express5800/120Lj [N8100-1417]
RIP: 0010:[<ffffffffa023629d>]  [<ffffffffa023629d>] dm_softirq_done+0xbd/0x100 [dm_mod]
RSP: 0018:ffff8800280a1f08  EFLAGS: 00010282
RAX: ffffffffa02544e0 RBX: ffff8802aa1111d0 RCX: ffff8802aa1111e0
RDX: ffff8802ab913e70 RSI: 0000000000000000 RDI: ffff8802ab913e70
RBP: ffff8800280a1f28 R08: ffffc90005457040 R09: 0000000000000000
R10: 0000000000000001 R11: 0000000000000000 R12: 00000000fffffffb
R13: ffff8802ab913e88 R14: ffff8802ab9c1438 R15: 0000000000000100
FS:  0000000000000000(0000) GS:ffff88002809e000(0000) knlGS:0000000000000000
CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 0000003d54a98640 CR3: 000000029f0a1000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process ksoftirqd/1 (pid: 7, threadinfo ffff8802ae50e000, task ffff8802ae4f8040)
Stack:
 ffff8800280a1f38 0000000000000020 ffffffff814f30a0 0000000000000004
<0> ffff8800280a1f58 ffffffff8116b245 ffff8800280a1f38 ffff8800280a1f38
<0> ffff8800280a1f58 0000000000000001 ffff8800280a1fa8 ffffffff810477bc
Call Trace:
 <IRQ>
 [<ffffffff8116b245>] blk_done_softirq+0x75/0x90
 [<ffffffff810477bc>] __do_softirq+0xcc/0x210
 [<ffffffff81047170>] ? ksoftirqd+0x0/0x110
 [<ffffffff8100ce7c>] call_softirq+0x1c/0x50
 <EOI>
 [<ffffffff8100e785>] do_softirq+0x65/0xa0
 [<ffffffff81047170>] ? ksoftirqd+0x0/0x110
 [<ffffffff810471e0>] ksoftirqd+0x70/0x110
 [<ffffffff81059559>] kthread+0x99/0xb0
 [<ffffffff8100cd7a>] child_rip+0xa/0x20
 [<ffffffff8100c73c>] ? restore_args+0x0/0x30
 [<ffffffff810594c0>] ? kthread+0x0/0xb0
 [<ffffffff8100cd70>] ? child_rip+0x0/0x20
Code: 44 89 e6 48 89 df e8 23 fb f2 e0 be 01 00 00 00 4c 89 f7 e8 f6 fd ff ff 5b 41 5c 41 5d 41 5e c9 c3 4c 89 ef e8 85 fe ff ff eb ed <0f> 0b eb fe 41 8b 85 dc 00 00 00 48 83 bb 10 01 00 00 00 89 83
RIP  [<ffffffffa023629d>] dm_softirq_done+0xbd/0x100 [dm_mod]
 RSP <ffff8800280a1f08>
---[ end trace 16af0a1d8542da55 ]---

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8a311ea0d44..b4845b14740 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -738,16 +738,22 @@ static void rq_completed(struct mapped_device *md, int run_queue)
 	dm_put(md);
 }
 
+static void free_rq_clone(struct request *clone)
+{
+	struct dm_rq_target_io *tio = clone->end_io_data;
+
+	blk_rq_unprep_clone(clone);
+	free_rq_tio(tio);
+}
+
 static void dm_unprep_request(struct request *rq)
 {
 	struct request *clone = rq->special;
-	struct dm_rq_target_io *tio = clone->end_io_data;
 
 	rq->special = NULL;
 	rq->cmd_flags &= ~REQ_DONTPREP;
 
-	blk_rq_unprep_clone(clone);
-	free_rq_tio(tio);
+	free_rq_clone(clone);
 }
 
 /*
@@ -825,8 +831,7 @@ static void dm_end_request(struct request *clone, int error)
 			rq->sense_len = clone->sense_len;
 	}
 
-	BUG_ON(clone->bio);
-	free_rq_tio(tio);
+	free_rq_clone(clone);
 
 	blk_end_request_all(rq, error);
 
-- 
cgit v1.2.3


From 8811f46c1f9386fc7017150de9d52359e5b1826e Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 4 Sep 2009 20:40:19 +0100
Subject: dm snapshot: implement iterate devices

Implement the .iterate_devices for the origin and snapshot targets.
dm-snapshot's lack of .iterate_devices resulted in the inability to
properly establish queue_limits for both targets.

With 4K sector drives: an unfortunate side-effect of not establishing
proper limits in either targets' DM device was that IO to the devices
would fail even though both had been created without error.

Commit af4874e03ed82f050d5872d8c39ce64bf16b5c38 ("dm target:s introduce
iterate devices fn") in 2.6.31-rc1 should have implemented .iterate_devices
for dm-snap.c's origin and snapshot targets.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index d573165cd2b..57f1bf7f3b7 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1176,6 +1176,15 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
 	return 0;
 }
 
+static int snapshot_iterate_devices(struct dm_target *ti,
+				    iterate_devices_callout_fn fn, void *data)
+{
+	struct dm_snapshot *snap = ti->private;
+
+	return fn(ti, snap->origin, 0, ti->len, data);
+}
+
+
 /*-----------------------------------------------------------------
  * Origin methods
  *---------------------------------------------------------------*/
@@ -1410,20 +1419,29 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result,
 	return 0;
 }
 
+static int origin_iterate_devices(struct dm_target *ti,
+				  iterate_devices_callout_fn fn, void *data)
+{
+	struct dm_dev *dev = ti->private;
+
+	return fn(ti, dev, 0, ti->len, data);
+}
+
 static struct target_type origin_target = {
 	.name    = "snapshot-origin",
-	.version = {1, 6, 0},
+	.version = {1, 7, 0},
 	.module  = THIS_MODULE,
 	.ctr     = origin_ctr,
 	.dtr     = origin_dtr,
 	.map     = origin_map,
 	.resume  = origin_resume,
 	.status  = origin_status,
+	.iterate_devices = origin_iterate_devices,
 };
 
 static struct target_type snapshot_target = {
 	.name    = "snapshot",
-	.version = {1, 6, 0},
+	.version = {1, 7, 0},
 	.module  = THIS_MODULE,
 	.ctr     = snapshot_ctr,
 	.dtr     = snapshot_dtr,
@@ -1431,6 +1449,7 @@ static struct target_type snapshot_target = {
 	.end_io  = snapshot_end_io,
 	.resume  = snapshot_resume,
 	.status  = snapshot_status,
+	.iterate_devices = snapshot_iterate_devices,
 };
 
 static int __init dm_snapshot_init(void)
-- 
cgit v1.2.3


From f6a1ed10864b7540fa758bbccf3433fe17070329 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 4 Sep 2009 20:40:22 +0100
Subject: dm table: fix queue_limit checking device iterator

The logic to check for valid device areas is inverted relative to proper
use with iterate_devices.

The iterate_devices method calls its callback for every underlying
device in the target.  If any callback returns non-zero, iterate_devices
exits immediately.  But the callback device_area_is_valid() returns 0 on
error and 1 on success.  The overall effect without is that an error is
issued only if every device is invalid.

This patch renames device_area_is_valid to device_area_is_invalid and
inverts the logic so that one invalid device is sufficient to raise
an error.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index d952b344191..aa60526075d 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -343,10 +343,10 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
 }
 
 /*
- * If possible, this checks an area of a destination device is valid.
+ * If possible, this checks an area of a destination device is invalid.
  */
-static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev,
-				sector_t start, sector_t len, void *data)
+static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
+				  sector_t start, sector_t len, void *data)
 {
 	struct queue_limits *limits = data;
 	struct block_device *bdev = dev->bdev;
@@ -357,16 +357,16 @@ static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev,
 	char b[BDEVNAME_SIZE];
 
 	if (!dev_size)
-		return 1;
+		return 0;
 
 	if ((start >= dev_size) || (start + len > dev_size)) {
 		DMWARN("%s: %s too small for target",
 		       dm_device_name(ti->table->md), bdevname(bdev, b));
-		return 0;
+		return 1;
 	}
 
 	if (logical_block_size_sectors <= 1)
-		return 1;
+		return 0;
 
 	if (start & (logical_block_size_sectors - 1)) {
 		DMWARN("%s: start=%llu not aligned to h/w "
@@ -374,7 +374,7 @@ static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev,
 		       dm_device_name(ti->table->md),
 		       (unsigned long long)start,
 		       limits->logical_block_size, bdevname(bdev, b));
-		return 0;
+		return 1;
 	}
 
 	if (len & (logical_block_size_sectors - 1)) {
@@ -383,10 +383,10 @@ static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev,
 		       dm_device_name(ti->table->md),
 		       (unsigned long long)len,
 		       limits->logical_block_size, bdevname(bdev, b));
-		return 0;
+		return 1;
 	}
 
-	return 1;
+	return 0;
 }
 
 /*
@@ -1000,8 +1000,8 @@ int dm_calculate_queue_limits(struct dm_table *table,
 		 * Check each device area is consistent with the target's
 		 * overall queue limits.
 		 */
-		if (!ti->type->iterate_devices(ti, device_area_is_valid,
-					       &ti_limits))
+		if (ti->type->iterate_devices(ti, device_area_is_invalid,
+					      &ti_limits))
 			return -EINVAL;
 
 combine_limits:
-- 
cgit v1.2.3


From a963a956225eb0f8c4d3537f428153c30adf54b8 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 4 Sep 2009 20:40:24 +0100
Subject: dm table: add more context to terse warning messages

A couple of recent warning messages make it difficult for the reader to
determine exactly what is wrong.  This patch adds more information to
those messages.

The messages were added by these commits:
  5dea271b6d87bd1d79a59c1d5baac2596a841c37 ("dm table: pass correct dev area size
to device_area_is_valid")
  ea9df47cc92573b159ef3b4fda516c32cba9c4fd ("dm table: fix blk_stack_limits arg
to use bytes not sectors")

The patch also corrects references to logical_block_size in printk format
strings from %hu to %u.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index aa60526075d..c90e662d280 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -360,8 +360,12 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
 		return 0;
 
 	if ((start >= dev_size) || (start + len > dev_size)) {
-		DMWARN("%s: %s too small for target",
-		       dm_device_name(ti->table->md), bdevname(bdev, b));
+		DMWARN("%s: %s too small for target: "
+		       "start=%llu, len=%llu, dev_size=%llu",
+		       dm_device_name(ti->table->md), bdevname(bdev, b),
+		       (unsigned long long)start,
+		       (unsigned long long)len,
+		       (unsigned long long)dev_size);
 		return 1;
 	}
 
@@ -370,7 +374,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
 
 	if (start & (logical_block_size_sectors - 1)) {
 		DMWARN("%s: start=%llu not aligned to h/w "
-		       "logical block size %hu of %s",
+		       "logical block size %u of %s",
 		       dm_device_name(ti->table->md),
 		       (unsigned long long)start,
 		       limits->logical_block_size, bdevname(bdev, b));
@@ -379,7 +383,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
 
 	if (len & (logical_block_size_sectors - 1)) {
 		DMWARN("%s: len=%llu not aligned to h/w "
-		       "logical block size %hu of %s",
+		       "logical block size %u of %s",
 		       dm_device_name(ti->table->md),
 		       (unsigned long long)len,
 		       limits->logical_block_size, bdevname(bdev, b));
@@ -496,8 +500,15 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
 	}
 
 	if (blk_stack_limits(limits, &q->limits, start << 9) < 0)
-		DMWARN("%s: target device %s is misaligned",
-		       dm_device_name(ti->table->md), bdevname(bdev, b));
+		DMWARN("%s: target device %s is misaligned: "
+		       "physical_block_size=%u, logical_block_size=%u, "
+		       "alignment_offset=%u, start=%llu",
+		       dm_device_name(ti->table->md), bdevname(bdev, b),
+		       q->limits.physical_block_size,
+		       q->limits.logical_block_size,
+		       q->limits.alignment_offset,
+		       (unsigned long long) start << 9);
+
 
 	/*
 	 * Check if merge fn is supported.
@@ -698,7 +709,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table,
 
 	if (remaining) {
 		DMWARN("%s: table line %u (start sect %llu len %llu) "
-		       "not aligned to h/w logical block size %hu",
+		       "not aligned to h/w logical block size %u",
 		       dm_device_name(table->md), i,
 		       (unsigned long long) ti->begin,
 		       (unsigned long long) ti->len,
-- 
cgit v1.2.3


From 40bea431274c247425e7f5970d796ff7b37a2b22 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 4 Sep 2009 20:40:25 +0100
Subject: dm stripe: expose correct io hints

Set sensible I/O hints for striped DM devices in the topology
infrastructure added for 2.6.31 for userspace tools to
obtain via sysfs.

Add .io_hints to 'struct target_type' to allow the I/O hints portion
(io_min and io_opt) of the 'struct queue_limits' to be set by each
target and implement this for dm-stripe.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-stripe.c        | 13 ++++++++++++-
 drivers/md/dm-table.c         |  4 ++++
 include/linux/device-mapper.h |  4 ++++
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 4e0e5937e42..3e563d25173 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -329,9 +329,19 @@ static int stripe_iterate_devices(struct dm_target *ti,
 	return ret;
 }
 
+static void stripe_io_hints(struct dm_target *ti,
+			    struct queue_limits *limits)
+{
+	struct stripe_c *sc = ti->private;
+	unsigned chunk_size = (sc->chunk_mask + 1) << 9;
+
+	blk_limits_io_min(limits, chunk_size);
+	limits->io_opt = chunk_size * sc->stripes;
+}
+
 static struct target_type stripe_target = {
 	.name   = "striped",
-	.version = {1, 2, 0},
+	.version = {1, 3, 0},
 	.module = THIS_MODULE,
 	.ctr    = stripe_ctr,
 	.dtr    = stripe_dtr,
@@ -339,6 +349,7 @@ static struct target_type stripe_target = {
 	.end_io = stripe_end_io,
 	.status = stripe_status,
 	.iterate_devices = stripe_iterate_devices,
+	.io_hints = stripe_io_hints,
 };
 
 int __init dm_stripe_init(void)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index c90e662d280..1a6cb3c7822 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1007,6 +1007,10 @@ int dm_calculate_queue_limits(struct dm_table *table,
 		ti->type->iterate_devices(ti, dm_set_device_limits,
 					  &ti_limits);
 
+		/* Set I/O hints portion of queue limits */
+		if (ti->type->io_hints)
+			ti->type->io_hints(ti, &ti_limits);
+
 		/*
 		 * Check each device area is consistent with the target's
 		 * overall queue limits.
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 655e7721580..df7607e6dce 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -91,6 +91,9 @@ typedef int (*dm_iterate_devices_fn) (struct dm_target *ti,
 				      iterate_devices_callout_fn fn,
 				      void *data);
 
+typedef void (*dm_io_hints_fn) (struct dm_target *ti,
+				struct queue_limits *limits);
+
 /*
  * Returns:
  *    0: The target can handle the next I/O immediately.
@@ -151,6 +154,7 @@ struct target_type {
 	dm_merge_fn merge;
 	dm_busy_fn busy;
 	dm_iterate_devices_fn iterate_devices;
+	dm_io_hints_fn io_hints;
 
 	/* For internal device-mapper use. */
 	struct list_head list;
-- 
cgit v1.2.3


From 4142a969175302bc843d1505133488bfdbfa4732 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Fri, 4 Sep 2009 20:40:28 +0100
Subject: dm log: fix userspace status output

Fix 'dmsetup table' output.

There is a missing ' ' at the end of the string causing two
words to run together.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log-userspace-base.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index e69b9656099..2f2a244e110 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -577,7 +577,7 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
 		break;
 	case STATUSTYPE_TABLE:
 		sz = 0;
-		DMEMIT("%s %u %s %s", log->type->name, lc->usr_argc + 1,
+		DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc + 1,
 		       lc->uuid, lc->usr_argv_str);
 		break;
 	}
-- 
cgit v1.2.3


From b8313b6da7e2e7c7f47d93d8561969a3ff9ba0ea Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Fri, 4 Sep 2009 20:40:30 +0100
Subject: dm log: remove incorrect field from userspace table output

The output of 'dmsetup table' includes an internal field that should not
be there.  This patch removes it.  To make the fix simpler, we first
reorder a constructor argument

The 'device size' argument is generated internally.  Currently it is
placed as the last space-separated word of the constructor string.
However, we need to use a version of the string without this word, so we
move it to the beginning instead so it is trivial to skip past it.

We keep a copy of the arguments passed to userspace for creating a log,
just in case we need to resend them.  These are the same arguments that
are desired in the STATUSTYPE_TABLE request, except for one.  When
creating the userspace log, the userspace daemon must know the size of
the mirror, so that is added to the arguments given in the constructor
table.  We were printing this extra argument out as well, which is a
mistake.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log-userspace-base.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 2f2a244e110..c49da0a41c8 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -111,10 +111,9 @@ static int build_constructor_string(struct dm_target *ti,
 		return -ENOMEM;
 	}
 
-	for (i = 0, str_size = 0; i < argc; i++)
-		str_size += sprintf(str + str_size, "%s ", argv[i]);
-	str_size += sprintf(str + str_size, "%llu",
-			    (unsigned long long)ti->len);
+	str_size = sprintf(str, "%llu", (unsigned long long)ti->len);
+	for (i = 0; i < argc; i++)
+		str_size += sprintf(str + str_size, " %s", argv[i]);
 
 	*ctr_str = str;
 	return str_size;
@@ -561,6 +560,7 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
 			    char *result, unsigned maxlen)
 {
 	int r = 0;
+	char *table_args;
 	size_t sz = (size_t)maxlen;
 	struct log_c *lc = log->context;
 
@@ -577,8 +577,12 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
 		break;
 	case STATUSTYPE_TABLE:
 		sz = 0;
-		DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc + 1,
-		       lc->uuid, lc->usr_argv_str);
+		table_args = strstr(lc->usr_argv_str, " ");
+		BUG_ON(!table_args); /* There will always be a ' ' */
+		table_args++;
+
+		DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc,
+		       lc->uuid, table_args);
 		break;
 	}
 	return (r) ? 0 : (int)sz;
-- 
cgit v1.2.3


From d2b698644c97cb033261536a4f2010924a00eac9 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Fri, 4 Sep 2009 20:40:32 +0100
Subject: dm raid1: do not allow log_failure variable to unset after being set

This patch fixes a bug which was triggering a case where the primary leg
could not be changed on failure even when the mirror was in-sync.

The case involves the failure of the primary device along with
the transient failure of the log device.  The problem is that
bios can be put on the 'failures' list (due to log failure)
before 'fail_mirror' is called due to the primary device failure.
Normally, this is fine, but if the log device failure is transient,
a subsequent iteration of the work thread, 'do_mirror', will
reset 'log_failure'.  The 'do_failures' function then resets
the 'in_sync' variable when processing bios on the failures list.
The 'in_sync' variable is what is used to determine if the
primary device can be switched in the event of a failure.  Since
this has been reset, the primary device is incorrectly assumed
to be not switchable.

The case has been seen in the cluster mirror context, where one
machine realizes the log device is dead before the other machines.
As the responsibilities of the server migrate from one node to
another (because the mirror is being reconfigured due to the failure),
the new server may think for a moment that the log device is fine -
thus resetting the 'log_failure' variable.

In any case, it is inappropiate for us to reset the 'log_failure'
variable.  The above bug simply illustrates that it can actually
hurt us.

Cc: stable@kernel.org
Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9726577cde4..33f179e66bf 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -648,7 +648,13 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 	 */
 	dm_rh_inc_pending(ms->rh, &sync);
 	dm_rh_inc_pending(ms->rh, &nosync);
-	ms->log_failure = dm_rh_flush(ms->rh) ? 1 : 0;
+
+	/*
+	 * If the flush fails on a previous call and succeeds here,
+	 * we must not reset the log_failure variable.  We need
+	 * userspace interaction to do that.
+	 */
+	ms->log_failure = dm_rh_flush(ms->rh) ? 1 : ms->log_failure;
 
 	/*
 	 * Dispatch io.
-- 
cgit v1.2.3


From 7ec23d50949d5062b5b749638dd9380ed75e58e5 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Fri, 4 Sep 2009 20:40:34 +0100
Subject: dm log: userspace add luid to distinguish between concurrent log
 instances

Device-mapper userspace logs (like the clustered log) are
identified by a universally unique identifier (UUID).  This
identifier is used to associate requests from the kernel to
a specific log in userspace.  The UUID must be unique everywhere,
since multiple machines may use this identifier when communicating
about a particular log, as is the case for cluster logs.

Sometimes, device-mapper/LVM may re-use a UUID.  This is the
case during pvmoves, when moving from one segment of an LV
to another, or when resizing a mirror, etc.  In these cases,
a new log is created with the same UUID and loaded in the
"inactive" slot.  When a device-mapper "resume" is issued,
the "live" table is deactivated and the new "inactive" table
becomes "live".  (The "inactive" table can also be removed
via a device-mapper 'clear' command.)

The above two issues were colliding.  More than one log was being
created with the same UUID, and there was no way to distinguish
between them.  So, sometimes the wrong log would be swapped
out during the exchange.

The solution is to create a locally unique identifier,
'luid', to go along with the UUID.  This new identifier is used
to determine exactly which log is being referenced by the kernel
when the log exchange is made.  The identifier is not
universally safe, but it does not need to be, since
create/destroy/suspend/resume operations are bound to a specific
machine; and these are the operations that make up the exchange.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log-userspace-base.c     | 23 ++++++++++++++---------
 drivers/md/dm-log-userspace-transfer.c |  6 ++++--
 drivers/md/dm-log-userspace-transfer.h |  2 +-
 include/linux/dm-log-userspace.h       | 13 ++++++++++++-
 4 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index c49da0a41c8..6e186b1a062 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -21,6 +21,7 @@ struct log_c {
 	struct dm_target *ti;
 	uint32_t region_size;
 	region_t region_count;
+	uint64_t luid;
 	char uuid[DM_UUID_LEN];
 
 	char *usr_argv_str;
@@ -63,7 +64,7 @@ static int userspace_do_request(struct log_c *lc, const char *uuid,
 	 * restored.
 	 */
 retry:
-	r = dm_consult_userspace(uuid, request_type, data,
+	r = dm_consult_userspace(uuid, lc->luid, request_type, data,
 				 data_size, rdata, rdata_size);
 
 	if (r != -ESRCH)
@@ -74,14 +75,15 @@ retry:
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule_timeout(2*HZ);
 		DMWARN("Attempting to contact userspace log server...");
-		r = dm_consult_userspace(uuid, DM_ULOG_CTR, lc->usr_argv_str,
+		r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_CTR,
+					 lc->usr_argv_str,
 					 strlen(lc->usr_argv_str) + 1,
 					 NULL, NULL);
 		if (!r)
 			break;
 	}
 	DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete");
-	r = dm_consult_userspace(uuid, DM_ULOG_RESUME, NULL,
+	r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_RESUME, NULL,
 				 0, NULL, NULL);
 	if (!r)
 		goto retry;
@@ -153,6 +155,9 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 		return -ENOMEM;
 	}
 
+	/* The ptr value is sufficient for local unique id */
+	lc->luid = (uint64_t)lc;
+
 	lc->ti = ti;
 
 	if (strlen(argv[0]) > (DM_UUID_LEN - 1)) {
@@ -172,7 +177,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 	}
 
 	/* Send table string */
-	r = dm_consult_userspace(lc->uuid, DM_ULOG_CTR,
+	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR,
 				 ctr_str, str_size, NULL, NULL);
 
 	if (r == -ESRCH) {
@@ -182,7 +187,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 
 	/* Since the region size does not change, get it now */
 	rdata_size = sizeof(rdata);
-	r = dm_consult_userspace(lc->uuid, DM_ULOG_GET_REGION_SIZE,
+	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_GET_REGION_SIZE,
 				 NULL, 0, (char *)&rdata, &rdata_size);
 
 	if (r) {
@@ -211,7 +216,7 @@ static void userspace_dtr(struct dm_dirty_log *log)
 	int r;
 	struct log_c *lc = log->context;
 
-	r = dm_consult_userspace(lc->uuid, DM_ULOG_DTR,
+	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
 				 NULL, 0,
 				 NULL, NULL);
 
@@ -226,7 +231,7 @@ static int userspace_presuspend(struct dm_dirty_log *log)
 	int r;
 	struct log_c *lc = log->context;
 
-	r = dm_consult_userspace(lc->uuid, DM_ULOG_PRESUSPEND,
+	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND,
 				 NULL, 0,
 				 NULL, NULL);
 
@@ -238,7 +243,7 @@ static int userspace_postsuspend(struct dm_dirty_log *log)
 	int r;
 	struct log_c *lc = log->context;
 
-	r = dm_consult_userspace(lc->uuid, DM_ULOG_POSTSUSPEND,
+	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND,
 				 NULL, 0,
 				 NULL, NULL);
 
@@ -251,7 +256,7 @@ static int userspace_resume(struct dm_dirty_log *log)
 	struct log_c *lc = log->context;
 
 	lc->in_sync_hint = 0;
-	r = dm_consult_userspace(lc->uuid, DM_ULOG_RESUME,
+	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME,
 				 NULL, 0,
 				 NULL, NULL);
 
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 8ce74d95ae4..ba0edad2d04 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -147,7 +147,8 @@ static void cn_ulog_callback(void *data)
 
 /**
  * dm_consult_userspace
- * @uuid: log's uuid (must be DM_UUID_LEN in size)
+ * @uuid: log's universal unique identifier (must be DM_UUID_LEN in size)
+ * @luid: log's local unique identifier
  * @request_type:  found in include/linux/dm-log-userspace.h
  * @data: data to tx to the server
  * @data_size: size of data in bytes
@@ -163,7 +164,7 @@ static void cn_ulog_callback(void *data)
  *
  * Returns: 0 on success, -EXXX on failure
  **/
-int dm_consult_userspace(const char *uuid, int request_type,
+int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type,
 			 char *data, size_t data_size,
 			 char *rdata, size_t *rdata_size)
 {
@@ -190,6 +191,7 @@ resend:
 
 	memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size);
 	memcpy(tfr->uuid, uuid, DM_UUID_LEN);
+	tfr->luid = luid;
 	tfr->seq = dm_ulog_seq++;
 
 	/*
diff --git a/drivers/md/dm-log-userspace-transfer.h b/drivers/md/dm-log-userspace-transfer.h
index c26d8e4e271..04ee874f915 100644
--- a/drivers/md/dm-log-userspace-transfer.h
+++ b/drivers/md/dm-log-userspace-transfer.h
@@ -11,7 +11,7 @@
 
 int dm_ulog_tfr_init(void);
 void dm_ulog_tfr_exit(void);
-int dm_consult_userspace(const char *uuid, int request_type,
+int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type,
 			 char *data, size_t data_size,
 			 char *rdata, size_t *rdata_size);
 
diff --git a/include/linux/dm-log-userspace.h b/include/linux/dm-log-userspace.h
index 642e3017b51..8a1f972c0fe 100644
--- a/include/linux/dm-log-userspace.h
+++ b/include/linux/dm-log-userspace.h
@@ -371,7 +371,18 @@
 	(DM_ULOG_REQUEST_MASK & (request_type))
 
 struct dm_ulog_request {
-	char uuid[DM_UUID_LEN]; /* Ties a request to a specific mirror log */
+	/*
+	 * The local unique identifier (luid) and the universally unique
+	 * identifier (uuid) are used to tie a request to a specific
+	 * mirror log.  A single machine log could probably make due with
+	 * just the 'luid', but a cluster-aware log must use the 'uuid' and
+	 * the 'luid'.  The uuid is what is required for node to node
+	 * communication concerning a particular log, but the 'luid' helps
+	 * differentiate between logs that are being swapped and have the
+	 * same 'uuid'.  (Think "live" and "inactive" device-mapper tables.)
+	 */
+	uint64_t luid;
+	char uuid[DM_UUID_LEN];
 	char padding[7];        /* Padding because DM_UUID_LEN = 129 */
 
 	int32_t error;          /* Used to report back processing errors */
-- 
cgit v1.2.3


From 02d2fd31defce6ff77146ad0fef4f19006055d86 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 4 Sep 2009 20:40:37 +0100
Subject: dm snapshot: refactor zero_disk_area to use chunk_io

Refactor chunk_io to prepare for the fix in the following patch.

Pass an area pointer to chunk_io and simplify zero_disk_area to use
chunk_io.  No functional change.

Cc: stable@kernel.org
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap-persistent.c | 26 +++++++-------------------
 1 file changed, 7 insertions(+), 19 deletions(-)

diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 6e3fe4f1493..2a3d626a98d 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -188,7 +188,8 @@ static void do_metadata(struct work_struct *work)
 /*
  * Read or write a chunk aligned and sized block of data from a device.
  */
-static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata)
+static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
+		    int metadata)
 {
 	struct dm_io_region where = {
 		.bdev = ps->store->cow->bdev,
@@ -198,7 +199,7 @@ static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata)
 	struct dm_io_request io_req = {
 		.bi_rw = rw,
 		.mem.type = DM_IO_VMA,
-		.mem.ptr.vma = ps->area,
+		.mem.ptr.vma = area,
 		.client = ps->io_client,
 		.notify.fn = NULL,
 	};
@@ -240,7 +241,7 @@ static int area_io(struct pstore *ps, int rw)
 
 	chunk = area_location(ps, ps->current_area);
 
-	r = chunk_io(ps, chunk, rw, 0);
+	r = chunk_io(ps, ps->area, chunk, rw, 0);
 	if (r)
 		return r;
 
@@ -254,20 +255,7 @@ static void zero_memory_area(struct pstore *ps)
 
 static int zero_disk_area(struct pstore *ps, chunk_t area)
 {
-	struct dm_io_region where = {
-		.bdev = ps->store->cow->bdev,
-		.sector = ps->store->chunk_size * area_location(ps, area),
-		.count = ps->store->chunk_size,
-	};
-	struct dm_io_request io_req = {
-		.bi_rw = WRITE,
-		.mem.type = DM_IO_VMA,
-		.mem.ptr.vma = ps->zero_area,
-		.client = ps->io_client,
-		.notify.fn = NULL,
-	};
-
-	return dm_io(&io_req, 1, &where, NULL);
+	return chunk_io(ps, ps->zero_area, area_location(ps, area), WRITE, 0);
 }
 
 static int read_header(struct pstore *ps, int *new_snapshot)
@@ -297,7 +285,7 @@ static int read_header(struct pstore *ps, int *new_snapshot)
 	if (r)
 		return r;
 
-	r = chunk_io(ps, 0, READ, 1);
+	r = chunk_io(ps, ps->area, 0, READ, 1);
 	if (r)
 		goto bad;
 
@@ -359,7 +347,7 @@ static int write_header(struct pstore *ps)
 	dh->version = cpu_to_le32(ps->version);
 	dh->chunk_size = cpu_to_le32(ps->store->chunk_size);
 
-	return chunk_io(ps, 0, WRITE, 1);
+	return chunk_io(ps, ps->area, 0, WRITE, 1);
 }
 
 /*
-- 
cgit v1.2.3


From 61578dcd3fafe6babd72e8db32110cc0b630a432 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 4 Sep 2009 20:40:39 +0100
Subject: dm snapshot: fix header corruption race on invalidation

If a persistent snapshot fills up, a race can corrupt the on-disk header
which causes a crash on any future attempt to activate the snapshot
(typically while booting).  This patch fixes the race.

When the snapshot overflows, __invalidate_snapshot is called, which calls
snapshot store method drop_snapshot. It goes to persistent_drop_snapshot that
calls write_header. write_header constructs the new header in the "area"
location.

Concurrently, an existing kcopyd job may finish, call copy_callback
and commit_exception method, that goes to persistent_commit_exception.
persistent_commit_exception doesn't do locking, relying on the fact that
callbacks are single-threaded, but it can race with snapshot invalidation and
overwrite the header that is just being written while the snapshot is being
invalidated.

The result of this race is a corrupted header being written that can
lead to a crash on further reactivation (if chunk_size is zero in the
corrupted header).

The fix is to use separate memory areas for each.

See the bug: https://bugzilla.redhat.com/show_bug.cgi?id=461506

Cc: stable@kernel.org
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap-persistent.c | 44 +++++++++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 10 deletions(-)

diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 2a3d626a98d..5d1a97580cb 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -105,6 +105,13 @@ struct pstore {
 	 */
 	void *zero_area;
 
+	/*
+	 * An area used for header. The header can be written
+	 * concurrently with metadata (when invalidating the snapshot),
+	 * so it needs a separate buffer.
+	 */
+	void *header_area;
+
 	/*
 	 * Used to keep track of which metadata area the data in
 	 * 'chunk' refers to.
@@ -148,16 +155,27 @@ static int alloc_area(struct pstore *ps)
 	 */
 	ps->area = vmalloc(len);
 	if (!ps->area)
-		return r;
+		goto err_area;
 
 	ps->zero_area = vmalloc(len);
-	if (!ps->zero_area) {
-		vfree(ps->area);
-		return r;
-	}
+	if (!ps->zero_area)
+		goto err_zero_area;
 	memset(ps->zero_area, 0, len);
 
+	ps->header_area = vmalloc(len);
+	if (!ps->header_area)
+		goto err_header_area;
+
 	return 0;
+
+err_header_area:
+	vfree(ps->zero_area);
+
+err_zero_area:
+	vfree(ps->area);
+
+err_area:
+	return r;
 }
 
 static void free_area(struct pstore *ps)
@@ -169,6 +187,10 @@ static void free_area(struct pstore *ps)
 	if (ps->zero_area)
 		vfree(ps->zero_area);
 	ps->zero_area = NULL;
+
+	if (ps->header_area)
+		vfree(ps->header_area);
+	ps->header_area = NULL;
 }
 
 struct mdata_req {
@@ -285,11 +307,11 @@ static int read_header(struct pstore *ps, int *new_snapshot)
 	if (r)
 		return r;
 
-	r = chunk_io(ps, ps->area, 0, READ, 1);
+	r = chunk_io(ps, ps->header_area, 0, READ, 1);
 	if (r)
 		goto bad;
 
-	dh = (struct disk_header *) ps->area;
+	dh = ps->header_area;
 
 	if (le32_to_cpu(dh->magic) == 0) {
 		*new_snapshot = 1;
@@ -339,15 +361,15 @@ static int write_header(struct pstore *ps)
 {
 	struct disk_header *dh;
 
-	memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT);
+	memset(ps->header_area, 0, ps->store->chunk_size << SECTOR_SHIFT);
 
-	dh = (struct disk_header *) ps->area;
+	dh = ps->header_area;
 	dh->magic = cpu_to_le32(SNAP_MAGIC);
 	dh->valid = cpu_to_le32(ps->valid);
 	dh->version = cpu_to_le32(ps->version);
 	dh->chunk_size = cpu_to_le32(ps->store->chunk_size);
 
-	return chunk_io(ps, ps->area, 0, WRITE, 1);
+	return chunk_io(ps, ps->header_area, 0, WRITE, 1);
 }
 
 /*
@@ -667,6 +689,8 @@ static int persistent_ctr(struct dm_exception_store *store,
 	ps->valid = 1;
 	ps->version = SNAPSHOT_DISK_VERSION;
 	ps->area = NULL;
+	ps->zero_area = NULL;
+	ps->header_area = NULL;
 	ps->next_free = 2;	/* skipping the header and first area */
 	ps->current_committed = 0;
 
-- 
cgit v1.2.3


From 2defcc3fb4661e7351cb2ac48d843efc4c64db13 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 4 Sep 2009 20:40:41 +0100
Subject: dm exception store: split set_chunk_size

Break the function set_chunk_size to two functions in preparation for
the fix in the following patch.

Cc: stable@kernel.org
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-exception-store.c | 8 ++++++++
 drivers/md/dm-exception-store.h | 4 ++++
 2 files changed, 12 insertions(+)

diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 3710ff88fc1..4c01c7535fb 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -171,6 +171,14 @@ static int set_chunk_size(struct dm_exception_store *store,
 	 */
 	chunk_size_ulong = round_up(chunk_size_ulong, PAGE_SIZE >> 9);
 
+	return dm_exception_store_set_chunk_size(store, chunk_size_ulong,
+						 error);
+}
+
+int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
+				      unsigned long chunk_size_ulong,
+				      char **error)
+{
 	/* Check chunk_size is a power of 2 */
 	if (!is_power_of_2(chunk_size_ulong)) {
 		*error = "Chunk size is not a power of 2";
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 2442c8c0789..812c71872ba 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -168,6 +168,10 @@ static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
 int dm_exception_store_type_register(struct dm_exception_store_type *type);
 int dm_exception_store_type_unregister(struct dm_exception_store_type *type);
 
+int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
+				      unsigned long chunk_size_ulong,
+				      char **error);
+
 int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
 			      unsigned *args_used,
 			      struct dm_exception_store **store);
-- 
cgit v1.2.3


From ae0b7448e91353ea5f821601a055aca6b58042cd Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 4 Sep 2009 20:40:43 +0100
Subject: dm snapshot: fix on disk chunk size validation

Fix some problems seen in the chunk size processing when activating a
pre-existing snapshot.

For a new snapshot, the chunk size can either be supplied by the creator
or a default value can be used.  For an existing snapshot, the
chunk size in the snapshot header on disk should always be used.

If someone attempts to load an existing snapshot and has the 'default
chunk size' option set, the kernel uses its default value even when it
is incorrect for the snapshot being loaded.  This patch ensures the
correct on-disk value is always used.

Secondly, when the code does use the chunk size stored on the disk it is
prudent to revalidate it, so the code can exit cleanly if it got
corrupted as happened in
https://bugzilla.redhat.com/show_bug.cgi?id=461506 .

Cc: stable@kernel.org
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-exception-store.c |  5 +++++
 drivers/md/dm-snap-persistent.c | 22 ++++++++++++++--------
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 4c01c7535fb..556acff3952 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -191,6 +191,11 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
 		return -EINVAL;
 	}
 
+	if (chunk_size_ulong > INT_MAX >> SECTOR_SHIFT) {
+		*error = "Chunk size is too high";
+		return -EINVAL;
+	}
+
 	store->chunk_size = chunk_size_ulong;
 	store->chunk_mask = chunk_size_ulong - 1;
 	store->chunk_shift = ffs(chunk_size_ulong) - 1;
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 5d1a97580cb..d5b2e08750d 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -286,6 +286,7 @@ static int read_header(struct pstore *ps, int *new_snapshot)
 	struct disk_header *dh;
 	chunk_t chunk_size;
 	int chunk_size_supplied = 1;
+	char *chunk_err;
 
 	/*
 	 * Use default chunk size (or hardsect_size, if larger) if none supplied
@@ -329,20 +330,25 @@ static int read_header(struct pstore *ps, int *new_snapshot)
 	ps->version = le32_to_cpu(dh->version);
 	chunk_size = le32_to_cpu(dh->chunk_size);
 
-	if (!chunk_size_supplied || ps->store->chunk_size == chunk_size)
+	if (ps->store->chunk_size == chunk_size)
 		return 0;
 
-	DMWARN("chunk size %llu in device metadata overrides "
-	       "table chunk size of %llu.",
-	       (unsigned long long)chunk_size,
-	       (unsigned long long)ps->store->chunk_size);
+	if (chunk_size_supplied)
+		DMWARN("chunk size %llu in device metadata overrides "
+		       "table chunk size of %llu.",
+		       (unsigned long long)chunk_size,
+		       (unsigned long long)ps->store->chunk_size);
 
 	/* We had a bogus chunk_size. Fix stuff up. */
 	free_area(ps);
 
-	ps->store->chunk_size = chunk_size;
-	ps->store->chunk_mask = chunk_size - 1;
-	ps->store->chunk_shift = ffs(chunk_size) - 1;
+	r = dm_exception_store_set_chunk_size(ps->store, chunk_size,
+					      &chunk_err);
+	if (r) {
+		DMERR("invalid on-disk chunk size %llu: %s.",
+		      (unsigned long long)chunk_size, chunk_err);
+		return r;
+	}
 
 	r = dm_io_client_resize(sectors_to_pages(ps->store->chunk_size),
 				ps->io_client);
-- 
cgit v1.2.3