From a82c53a0e3f57f02782330372b7adad67b417645 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@comcast.net>
Date: Fri, 9 May 2008 13:28:36 +0200
Subject: splice: fix sendfile() issue with relay

Splice isn't always incrementing the ppos correctly, which broke
relay splice.

Signed-off-by: Tom Zanussi <zanussi@comcast.net>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/splice.c    | 12 ++++++++----
 kernel/relay.c |  2 +-
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/fs/splice.c b/fs/splice.c
index 78150038b58..a048ad2130c 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -983,7 +983,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 
 	while (len) {
 		size_t read_len;
-		loff_t pos = sd->pos;
+		loff_t pos = sd->pos, prev_pos = pos;
 
 		ret = do_splice_to(in, &pos, pipe, len, flags);
 		if (unlikely(ret <= 0))
@@ -998,15 +998,19 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 		 * could get stuck data in the internal pipe:
 		 */
 		ret = actor(pipe, sd);
-		if (unlikely(ret <= 0))
+		if (unlikely(ret <= 0)) {
+			sd->pos = prev_pos;
 			goto out_release;
+		}
 
 		bytes += ret;
 		len -= ret;
 		sd->pos = pos;
 
-		if (ret < read_len)
+		if (ret < read_len) {
+			sd->pos = prev_pos + ret;
 			goto out_release;
+		}
 	}
 
 done:
@@ -1072,7 +1076,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 
 	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
 	if (ret > 0)
-		*ppos += ret;
+		*ppos = sd.pos;
 
 	return ret;
 }
diff --git a/kernel/relay.c b/kernel/relay.c
index bc24dcdc570..7de644cdec4 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1191,7 +1191,7 @@ static ssize_t relay_file_splice_read(struct file *in,
 	ret = 0;
 	spliced = 0;
 
-	while (len) {
+	while (len && !spliced) {
 		ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
 		if (ret < 0)
 			break;
-- 
cgit v1.2.3


From ca39d651d17df49b6d11f851d56c0ce0ce01ea1a Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 20 May 2008 21:27:41 +0200
Subject: splice: handle try_to_release_page() failure

splice currently assumes that try_to_release_page() always suceeds,
but it can return failure. If it does, we cannot steal the page.

Acked-by: Mingming Cao <cmm@us.ibm.com
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/splice.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/splice.c b/fs/splice.c
index a048ad2130c..aa5f6f60b30 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -58,8 +58,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
 		 */
 		wait_on_page_writeback(page);
 
-		if (PagePrivate(page))
-			try_to_release_page(page, GFP_KERNEL);
+		if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
+			goto out_unlock;
 
 		/*
 		 * If we succeeded in removing the mapping, set LRU flag
@@ -75,6 +75,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
 	 * Raced with truncate or failed to remove page from current
 	 * address space, unlock and return failure.
 	 */
+out_unlock:
 	unlock_page(page);
 	return 1;
 }
-- 
cgit v1.2.3


From 05caf8dbc1880415df3378cfd114d832c9618b60 Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Thu, 22 May 2008 15:13:29 +0200
Subject: block: Move the second call to get_request to the end of the loop

In function get_request_wait, the second call to get_request could be
moved to the end of the while loop, because if the first call to
get_request fails, the second call will fail without sleep.

Signed-off-by: Zhang Yanmin <yanmin.zhang@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c | 37 +++++++++++++++++--------------------
 1 file changed, 17 insertions(+), 20 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 6a9cc0d22a6..1905aaba49f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -806,35 +806,32 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
 	rq = get_request(q, rw_flags, bio, GFP_NOIO);
 	while (!rq) {
 		DEFINE_WAIT(wait);
+		struct io_context *ioc;
 		struct request_list *rl = &q->rq;
 
 		prepare_to_wait_exclusive(&rl->wait[rw], &wait,
 				TASK_UNINTERRUPTIBLE);
 
-		rq = get_request(q, rw_flags, bio, GFP_NOIO);
-
-		if (!rq) {
-			struct io_context *ioc;
+		blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
 
-			blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
-
-			__generic_unplug_device(q);
-			spin_unlock_irq(q->queue_lock);
-			io_schedule();
+		__generic_unplug_device(q);
+		spin_unlock_irq(q->queue_lock);
+		io_schedule();
 
-			/*
-			 * After sleeping, we become a "batching" process and
-			 * will be able to allocate at least one request, and
-			 * up to a big batch of them for a small period time.
-			 * See ioc_batching, ioc_set_batching
-			 */
-			ioc = current_io_context(GFP_NOIO, q->node);
-			ioc_set_batching(q, ioc);
+		/*
+		 * After sleeping, we become a "batching" process and
+		 * will be able to allocate at least one request, and
+		 * up to a big batch of them for a small period time.
+		 * See ioc_batching, ioc_set_batching
+		 */
+		ioc = current_io_context(GFP_NOIO, q->node);
+		ioc_set_batching(q, ioc);
 
-			spin_lock_irq(q->queue_lock);
-		}
+		spin_lock_irq(q->queue_lock);
 		finish_wait(&rl->wait[rw], &wait);
-	}
+
+		rq = get_request(q, rw_flags, bio, GFP_NOIO);
+	};
 
 	return rq;
 }
-- 
cgit v1.2.3


From be754d2c2161c0cce11d62727016985ecb76831b Mon Sep 17 00:00:00 2001
From: Richard Kennedy <richard@rsk.demon.co.uk>
Date: Fri, 23 May 2008 06:52:00 +0200
Subject: block: reorder cfq_queue to save space on 64bit builds

saves 8 bytes of padding & increases objects/slab from 30 to 32 on my
AMD64 config

Signed-off-by: Richard Kennedy <richard@rsk.demon.co.uk>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index b399c62936e..4df3f052243 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -124,6 +124,8 @@ struct cfq_data {
 struct cfq_queue {
 	/* reference count */
 	atomic_t ref;
+	/* various state flags, see below */
+	unsigned int flags;
 	/* parent cfq_data */
 	struct cfq_data *cfqd;
 	/* service_tree member */
@@ -138,14 +140,14 @@ struct cfq_queue {
 	int queued[2];
 	/* currently allocated requests */
 	int allocated[2];
-	/* pending metadata requests */
-	int meta_pending;
 	/* fifo list of requests in sort_list */
 	struct list_head fifo;
 
 	unsigned long slice_end;
 	long slice_resid;
 
+	/* pending metadata requests */
+	int meta_pending;
 	/* number of requests that are on the dispatch list or inside driver */
 	int dispatched;
 
@@ -153,8 +155,6 @@ struct cfq_queue {
 	unsigned short ioprio, org_ioprio;
 	unsigned short ioprio_class, org_ioprio_class;
 
-	/* various state flags, see below */
-	unsigned int flags;
 };
 
 enum cfqq_state_flags {
-- 
cgit v1.2.3


From 9d5f09a424a67ddb959829894efb4c71cbf6d600 Mon Sep 17 00:00:00 2001
From: "Alan D. Brunelle" <Alan.Brunelle@hp.com>
Date: Tue, 27 May 2008 14:54:41 +0200
Subject: Added in MESSAGE notes for blktraces

Allows messages to be inserted into blktrace streams.

Signed-off-by: Alan D. Brunelle <alan.brunelle@hp.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blktrace.c             | 14 ++++++++++++++
 include/linux/blktrace_api.h | 25 +++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/block/blktrace.c b/block/blktrace.c
index b2cbb4e5d76..20e11f354f1 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -75,6 +75,20 @@ static void trace_note_time(struct blk_trace *bt)
 	local_irq_restore(flags);
 }
 
+void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
+{
+	int n;
+	va_list args;
+	static char bt_msg_buf[BLK_TN_MAX_MSG];
+
+	va_start(args, fmt);
+	n = vscnprintf(bt_msg_buf, BLK_TN_MAX_MSG, fmt, args);
+	va_end(args);
+
+	trace_note(bt, 0, BLK_TN_MESSAGE, bt_msg_buf, n);
+}
+EXPORT_SYMBOL_GPL(__trace_note_message);
+
 static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
 			 pid_t pid)
 {
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index cfc3147e5cf..b7cd8f1eedb 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -55,6 +55,7 @@ enum blktrace_act {
 enum blktrace_notify {
 	__BLK_TN_PROCESS = 0,		/* establish pid/name mapping */
 	__BLK_TN_TIMESTAMP,		/* include system clock */
+	__BLK_TN_MESSAGE,		/* Character string message */
 };
 
 
@@ -79,6 +80,7 @@ enum blktrace_notify {
 
 #define BLK_TN_PROCESS		(__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY))
 #define BLK_TN_TIMESTAMP	(__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY))
+#define BLK_TN_MESSAGE		(__BLK_TN_MESSAGE | BLK_TC_ACT(BLK_TC_NOTIFY))
 
 #define BLK_IO_TRACE_MAGIC	0x65617400
 #define BLK_IO_TRACE_VERSION	0x07
@@ -149,7 +151,28 @@ extern void blk_trace_shutdown(struct request_queue *);
 extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
 extern int do_blk_trace_setup(struct request_queue *q,
 	char *name, dev_t dev, struct blk_user_trace_setup *buts);
+extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
 
+/**
+ * blk_add_trace_msg - Add a (simple) message to the blktrace stream
+ * @q:		queue the io is for
+ * @fmt:	format to print message in
+ * args...	Variable argument list for format
+ *
+ * Description:
+ *     Records a (simple) message onto the blktrace stream.
+ *
+ *     NOTE: BLK_TN_MAX_MSG characters are output at most.
+ *     NOTE: Can not use 'static inline' due to presence of var args...
+ *
+ **/
+#define blk_add_trace_msg(q, fmt, ...)					\
+	do {								\
+		struct blk_trace *bt = (q)->blk_trace;			\
+		if (unlikely(bt))					\
+			__trace_note_message(bt, fmt, ##__VA_ARGS__);	\
+	} while (0)
+#define BLK_TN_MAX_MSG		1024
 
 /**
  * blk_add_trace_rq - Add a trace for a request oriented action
@@ -299,6 +322,8 @@ extern int blk_trace_remove(struct request_queue *q);
 #define blk_trace_setup(q, name, dev, arg)	(-ENOTTY)
 #define blk_trace_startstop(q, start)		(-ENOTTY)
 #define blk_trace_remove(q)			(-ENOTTY)
+#define blk_add_trace_msg(q, fmt, ...)		do { } while (0)
+
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
 #endif /* __KERNEL__ */
 #endif
-- 
cgit v1.2.3


From 4722dc52a891ab6cb2d637ddb87233e0ce277827 Mon Sep 17 00:00:00 2001
From: "Alan D. Brunelle" <Alan.Brunelle@hp.com>
Date: Tue, 27 May 2008 14:55:00 +0200
Subject: Added in elevator switch message to blktrace stream

Signed-off-by: Alan D. Brunelle <alan.brunelle@hp.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/elevator.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/elevator.c b/block/elevator.c
index 980f8ae147b..902dd1344d5 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -1110,6 +1110,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
 	spin_unlock_irq(q->queue_lock);
 
+	blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);
+
 	return 1;
 
 fail_register:
-- 
cgit v1.2.3


From 64565911cdb57c2f512a9715b985b5617402cc67 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 28 May 2008 14:45:33 +0200
Subject: block: make blktrace use per-cpu buffers for message notes

Currently it uses a single static char array, but that risks
being corrupted when multiple users issue message notes at the
same time. Make the buffers dynamically allocated when the trace
is setup and make them per-cpu instead.

The default max message size of 1k is also very large, the
interface is mainly for small text notes. So shrink it to 128 bytes.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blktrace.c             | 15 ++++++++++++---
 include/linux/blktrace_api.h |  3 ++-
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/block/blktrace.c b/block/blktrace.c
index 20e11f354f1..7ae87cc4a16 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -79,13 +79,16 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
 {
 	int n;
 	va_list args;
-	static char bt_msg_buf[BLK_TN_MAX_MSG];
+	char *buf;
 
+	preempt_disable();
+	buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
 	va_start(args, fmt);
-	n = vscnprintf(bt_msg_buf, BLK_TN_MAX_MSG, fmt, args);
+	n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
 	va_end(args);
 
-	trace_note(bt, 0, BLK_TN_MESSAGE, bt_msg_buf, n);
+	trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(__trace_note_message);
 
@@ -246,6 +249,7 @@ static void blk_trace_cleanup(struct blk_trace *bt)
 	debugfs_remove(bt->dropped_file);
 	blk_remove_tree(bt->dir);
 	free_percpu(bt->sequence);
+	free_percpu(bt->msg_data);
 	kfree(bt);
 }
 
@@ -360,6 +364,10 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (!bt->sequence)
 		goto err;
 
+	bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG);
+	if (!bt->msg_data)
+		goto err;
+
 	ret = -ENOENT;
 	dir = blk_create_tree(buts->name);
 	if (!dir)
@@ -406,6 +414,7 @@ err:
 		if (bt->dropped_file)
 			debugfs_remove(bt->dropped_file);
 		free_percpu(bt->sequence);
+		free_percpu(bt->msg_data);
 		if (bt->rchan)
 			relay_close(bt->rchan);
 		kfree(bt);
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index b7cd8f1eedb..e3ef903aae8 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -121,6 +121,7 @@ struct blk_trace {
 	int trace_state;
 	struct rchan *rchan;
 	unsigned long *sequence;
+	unsigned char *msg_data;
 	u16 act_mask;
 	u64 start_lba;
 	u64 end_lba;
@@ -172,7 +173,7 @@ extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
 		if (unlikely(bt))					\
 			__trace_note_message(bt, fmt, ##__VA_ARGS__);	\
 	} while (0)
-#define BLK_TN_MAX_MSG		1024
+#define BLK_TN_MAX_MSG		128
 
 /**
  * blk_add_trace_rq - Add a trace for a request oriented action
-- 
cgit v1.2.3


From d6de8be711b28049a5cb93c954722c311c7d3f7f Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 28 May 2008 14:46:59 +0200
Subject: cfq-iosched: fix RCU problem in cfq_cic_lookup()

cfq_cic_lookup() needs to properly protect ioc->ioc_data before
dereferencing it and also exclude updaters of ioc->ioc_data as well.

Also add a number of comments documenting why the existing RCU usage
is OK.

Thanks a lot to "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> for
review and comments!

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 4df3f052243..d01b411c72f 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1142,6 +1142,9 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 	kmem_cache_free(cfq_pool, cfqq);
 }
 
+/*
+ * Must always be called with the rcu_read_lock() held
+ */
 static void
 __call_for_each_cic(struct io_context *ioc,
 		    void (*func)(struct io_context *, struct cfq_io_context *))
@@ -1197,6 +1200,11 @@ static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
 	cfq_cic_free(cic);
 }
 
+/*
+ * Must be called with rcu_read_lock() held or preemption otherwise disabled.
+ * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
+ * and ->trim() which is called with the task lock held
+ */
 static void cfq_free_io_context(struct io_context *ioc)
 {
 	/*
@@ -1502,20 +1510,24 @@ static struct cfq_io_context *
 cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
 {
 	struct cfq_io_context *cic;
+	unsigned long flags;
 	void *k;
 
 	if (unlikely(!ioc))
 		return NULL;
 
+	rcu_read_lock();
+
 	/*
 	 * we maintain a last-hit cache, to avoid browsing over the tree
 	 */
 	cic = rcu_dereference(ioc->ioc_data);
-	if (cic && cic->key == cfqd)
+	if (cic && cic->key == cfqd) {
+		rcu_read_unlock();
 		return cic;
+	}
 
 	do {
-		rcu_read_lock();
 		cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd);
 		rcu_read_unlock();
 		if (!cic)
@@ -1524,10 +1536,13 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
 		k = cic->key;
 		if (unlikely(!k)) {
 			cfq_drop_dead_cic(cfqd, ioc, cic);
+			rcu_read_lock();
 			continue;
 		}
 
+		spin_lock_irqsave(&ioc->lock, flags);
 		rcu_assign_pointer(ioc->ioc_data, cic);
+		spin_unlock_irqrestore(&ioc->lock, flags);
 		break;
 	} while (1);
 
@@ -2134,6 +2149,10 @@ static void *cfq_init_queue(struct request_queue *q)
 
 static void cfq_slab_kill(void)
 {
+	/*
+	 * Caller already ensured that pending RCU callbacks are completed,
+	 * so we should have no busy allocations at this point.
+	 */
 	if (cfq_pool)
 		kmem_cache_destroy(cfq_pool);
 	if (cfq_ioc_pool)
@@ -2292,6 +2311,11 @@ static void __exit cfq_exit(void)
 	ioc_gone = &all_gone;
 	/* ioc_gone's update must be visible before reading ioc_count */
 	smp_wmb();
+
+	/*
+	 * this also protects us from entering cfq_slab_kill() with
+	 * pending RCU callbacks
+	 */
 	if (elv_ioc_count_read(ioc_count))
 		wait_for_completion(ioc_gone);
 	cfq_slab_kill();
-- 
cgit v1.2.3