From 75bd2ef1457998791cfc89cd59927574488fc22a Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Thu, 15 May 2008 09:09:23 -0600
Subject: bsg: cdev lock_kernel() pushdown

Push the cdev lock_kernel call into bsg_open().

Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 block/bsg.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/bsg.c b/block/bsg.c
index f0b7cd34321..dbe3ffd505c 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -19,6 +19,7 @@
 #include <linux/uio.h>
 #include <linux/idr.h>
 #include <linux/bsg.h>
+#include <linux/smp_lock.h>
 
 #include <scsi/scsi.h>
 #include <scsi/scsi_ioctl.h>
@@ -834,7 +835,11 @@ static struct bsg_device *bsg_get_device(struct inode *inode, struct file *file)
 
 static int bsg_open(struct inode *inode, struct file *file)
 {
-	struct bsg_device *bd = bsg_get_device(inode, file);
+	struct bsg_device *bd;
+
+	lock_kernel();
+	bd = bsg_get_device(inode, file);
+	unlock_kernel();
 
 	if (IS_ERR(bd))
 		return PTR_ERR(bd);
-- 
cgit v1.2.3


From 962cf36c5bf6d2840b8d66ee9a606fae2f540bbd Mon Sep 17 00:00:00 2001
From: "Carlos R. Mafra" <crmafra2@gmail.com>
Date: Thu, 15 May 2008 11:15:37 -0300
Subject: Remove argument from open_softirq which is always NULL

As git-grep shows, open_softirq() is always called with the last argument
being NULL

block/blk-core.c:       open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
kernel/hrtimer.c:       open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL);
kernel/rcuclassic.c:    open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
kernel/rcupreempt.c:    open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
kernel/sched.c: open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
kernel/softirq.c:       open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL);
kernel/softirq.c:       open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL);
kernel/timer.c: open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
net/core/dev.c: open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
net/core/dev.c: open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);

This observation has already been made by Matthew Wilcox in June 2002
(http://www.cs.helsinki.fi/linux/linux-kernel/2002-25/0687.html)

"I notice that none of the current softirq routines use the data element
passed to them."

and the situation hasn't changed since then. So it appears we can safely
remove that extra argument to save 128 (54) bytes of kernel data (text).

Signed-off-by: Carlos R. Mafra <crmafra@ift.unesp.br>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 block/blk-core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 6a9cc0d22a6..75fdc65136e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2048,7 +2048,7 @@ int __init blk_dev_init(void)
 	for_each_possible_cpu(i)
 		INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
 
-	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
+	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
 	register_hotcpu_notifier(&blk_cpu_notifier);
 
 	return 0;
-- 
cgit v1.2.3


From d585d0b9d73ed999cc7b8cf3cac4a5b01abb544e Mon Sep 17 00:00:00 2001
From: Divyesh Shah <dpshah@google.com>
Date: Mon, 16 Jun 2008 18:37:08 +0200
Subject: block: Fix the starving writes bug in the anticipatory IO scheduler

AS scheduler alternates between issuing read and write batches. It does
the batch switch only after all requests from the previous batch are
completed.

When switching to a write batch, if there is an on-going read request,
it waits for its completion and indicates its intention of switching by
setting ad->changed_batch and the new direction but does not update the
batch_expire_time for the new write batch which it does in the case of
no previous pending requests.
On completion of the read request, it sees that we were waiting for the
switch and schedules work for kblockd right away and resets the
ad->changed_batch flag.
Now when kblockd enters dispatch_request where it is expected to pick
up a write request, it in turn ends the write batch because the
batch_expire_time was not updated and still shows the expire timestamp
for the previous batch.

This results in the write starvation for all the cases where there is
the intention for switching to a write batch, but there is a previous
in-flight read request and the batch gets reverted to a read_batch
right away.

This also holds true in the reverse case (switching from a write batch
to a read batch with an in-flight write request).

I've checked that this bug exists on 2.6.11, 2.6.18, 2.6.24 and
linux-2.6-block git HEAD. I've tested the fix on x86 platforms with
SCSI drives where the driver asks for the next request while a current
request is in-flight.

This patch is based off linux-2.6-block git HEAD.

Bug reproduction:
A simple scenario which reproduces this bug is:
- dd if=/dev/hda3 of=/dev/null &
- lilo
   The lilo takes forever to complete.

This can also be reproduced fairly easily with the earlier dd and
another test program doing msync().

The example test program below should print out a message after every
iteration but it simply hangs forever. With this bugfix it makes forward
progress.

====
Example test program using msync() (thanks to suleiman AT google DOT
com)

/*
 * Read the x86 time-stamp counter.
 *
 * The two 32-bit halves are fetched through explicit "=a"/"=d" outputs
 * and combined by hand, so the result is the full 64-bit value on both
 * 32-bit and 64-bit x86.  "static" guarantees a definition is available
 * to the caller under C99/C11 inline rules.
 */
static inline uint64_t
rdtsc(void)
{
         uint32_t lo, hi;

         __asm __volatile("rdtsc" : "=a" (lo), "=d" (hi));
         return (((uint64_t)hi << 32) | lo);
}

/*
 * Map the file named by argv[1] and, 1000 times over, dirty its first
 * byte, flush the page with a synchronous msync(), then time the next
 * store to the same byte with rdtsc().  Passing any extra argument
 * prints the per-iteration timings; the average is always printed.
 */
int
main(int argc, char **argv)
{
         struct stat st;
         uint64_t e, s, t;
         char *p;
         long i;
         int fd;

         if (argc < 2) {
                 printf("Usage: %s <file>\n", argv[0]);
                 return (1);
         }

         if ((fd = open(argv[1], O_RDWR | O_NOATIME)) < 0)
                 err(1, "open");

         if (fstat(fd, &st) < 0)
                 err(1, "fstat");

         p = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE,
                  MAP_SHARED, fd, 0);
         /* mmap() reports failure as MAP_FAILED, not NULL. */
         if (p == MAP_FAILED)
                 err(1, "mmap");

         t = 0;
         for (i = 0; i < 1000; i++) {
                 *p = 0;
                 msync(p, 4096, MS_SYNC);
                 s = rdtsc();
                 *p = 0;
                 /* Compiler barrier: keep the store between the two reads. */
                 __asm __volatile(""::: "memory");
                 e = rdtsc();
                 if (argc > 2)
                         printf("%ld: %lld cycles %jd %jd\n",
                                i, (long long)(e - s),
                                (intmax_t)s, (intmax_t)e);
                 t += e - s;
         }
         printf("average time: %lld cycles\n", (long long)(t / 1000));
         return (0);
}

Cc: <stable@kernel.org>
Acked-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/as-iosched.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'block')

diff --git a/block/as-iosched.c b/block/as-iosched.c
index 8c3946787db..743f33a01a0 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -831,6 +831,8 @@ static void as_completed_request(struct request_queue *q, struct request *rq)
 	}
 
 	if (ad->changed_batch && ad->nr_dispatched == 1) {
+		ad->current_batch_expires = jiffies +
+					ad->batch_expire[ad->batch_data_dir];
 		kblockd_schedule_work(&ad->antic_work);
 		ad->changed_batch = 0;
 
-- 
cgit v1.2.3


From 9a11b4ed0e7c44bca7c939aa544c3c47aae40c12 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 29 May 2008 09:32:08 +0200
Subject: cfq-iosched: properly protect ioc_gone and ioc count

If we have multiple tasks freeing cfq_io_contexts when cfq-iosched
is being unloaded, we could complete() ioc_gone twice. Fix that by
protecting ioc_gone complete() and clearing with a spinlock for
just that purpose. Doesn't matter from a performance perspective,
since it'll only enter that path when ioc_gone != NULL (when cfq-iosched
is being rmmod'ed).

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d01b411c72f..32aa3674f8a 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -48,6 +48,7 @@ static struct kmem_cache *cfq_ioc_pool;
 
 static DEFINE_PER_CPU(unsigned long, ioc_count);
 static struct completion *ioc_gone;
+static DEFINE_SPINLOCK(ioc_gone_lock);
 
 #define CFQ_PRIO_LISTS		IOPRIO_BE_NR
 #define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
@@ -1177,8 +1178,19 @@ static void cfq_cic_free_rcu(struct rcu_head *head)
 	kmem_cache_free(cfq_ioc_pool, cic);
 	elv_ioc_count_dec(ioc_count);
 
-	if (ioc_gone && !elv_ioc_count_read(ioc_count))
-		complete(ioc_gone);
+	if (ioc_gone) {
+		/*
+		 * CFQ scheduler is exiting, grab exit lock and check
+		 * the pending io context count. If it hits zero,
+		 * complete ioc_gone and set it back to NULL
+		 */
+		spin_lock(&ioc_gone_lock);
+		if (ioc_gone && !elv_ioc_count_read(ioc_count)) {
+			complete(ioc_gone);
+			ioc_gone = NULL;
+		}
+		spin_unlock(&ioc_gone_lock);
+	}
 }
 
 static void cfq_cic_free(struct cfq_io_context *cic)
@@ -2317,7 +2329,7 @@ static void __exit cfq_exit(void)
 	 * pending RCU callbacks
 	 */
 	if (elv_ioc_count_read(ioc_count))
-		wait_for_completion(ioc_gone);
+		wait_for_completion(&all_gone);
 	cfq_slab_kill();
 }
 
-- 
cgit v1.2.3


From 863fddcb4b0caee4c2d5bd6e3b28779920516db3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 29 May 2008 09:35:22 +0200
Subject: as-iosched: properly protect ioc_gone and ioc count

If we have multiple tasks freeing io contexts when as-iosched
is being unloaded, we could complete() ioc_gone twice. Fix that by
protecting ioc_gone complete() and clearing with a spinlock for
just that purpose. Doesn't matter from a performance perspective,
since it'll only enter that path when ioc_gone != NULL (when as-iosched
is being rmmod'ed).

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/as-iosched.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/as-iosched.c b/block/as-iosched.c
index 743f33a01a0..9735acb5b4f 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -151,6 +151,7 @@ enum arq_state {
 
 static DEFINE_PER_CPU(unsigned long, ioc_count);
 static struct completion *ioc_gone;
+static DEFINE_SPINLOCK(ioc_gone_lock);
 
 static void as_move_to_dispatch(struct as_data *ad, struct request *rq);
 static void as_antic_stop(struct as_data *ad);
@@ -164,8 +165,19 @@ static void free_as_io_context(struct as_io_context *aic)
 {
 	kfree(aic);
 	elv_ioc_count_dec(ioc_count);
-	if (ioc_gone && !elv_ioc_count_read(ioc_count))
-		complete(ioc_gone);
+	if (ioc_gone) {
+		/*
+		 * AS scheduler is exiting, grab exit lock and check
+		 * the pending io context count. If it hits zero,
+		 * complete ioc_gone and set it back to NULL.
+		 */
+		spin_lock(&ioc_gone_lock);
+		if (ioc_gone && !elv_ioc_count_read(ioc_count)) {
+			complete(ioc_gone);
+			ioc_gone = NULL;
+		}
+		spin_unlock(&ioc_gone_lock);
+	}
 }
 
 static void as_trim(struct io_context *ioc)
@@ -1493,7 +1505,7 @@ static void __exit as_exit(void)
 	/* ioc_gone's update must be visible before reading ioc_count */
 	smp_wmb();
 	if (elv_ioc_count_read(ioc_count))
-		wait_for_completion(ioc_gone);
+		wait_for_completion(&all_gone);
 	synchronize_rcu();
 }
 
-- 
cgit v1.2.3


From 7b679138b3237a9a3d45a4fda23a58ac79cd279c Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 30 May 2008 12:23:07 +0200
Subject: cfq-iosched: add message logging through blktrace

Now that blktrace has the ability to carry arbitrary messages in
its stream, use that for some CFQ logging.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 65 ++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 55 insertions(+), 10 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 32aa3674f8a..0ebb626a25d 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -11,6 +11,7 @@
 #include <linux/elevator.h>
 #include <linux/rbtree.h>
 #include <linux/ioprio.h>
+#include <linux/blktrace_api.h>
 
 /*
  * tunables
@@ -41,7 +42,7 @@ static int cfq_slice_idle = HZ / 125;
 
 #define RQ_CIC(rq)		\
 	((struct cfq_io_context *) (rq)->elevator_private)
-#define RQ_CFQQ(rq)		((rq)->elevator_private2)
+#define RQ_CFQQ(rq)		((struct cfq_queue *) ((rq)->elevator_private2))
 
 static struct kmem_cache *cfq_pool;
 static struct kmem_cache *cfq_ioc_pool;
@@ -156,6 +157,7 @@ struct cfq_queue {
 	unsigned short ioprio, org_ioprio;
 	unsigned short ioprio_class, org_ioprio_class;
 
+	pid_t pid;
 };
 
 enum cfqq_state_flags {
@@ -199,6 +201,11 @@ CFQ_CFQQ_FNS(slice_new);
 CFQ_CFQQ_FNS(sync);
 #undef CFQ_CFQQ_FNS
 
+#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
+	blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
+#define cfq_log(cfqd, fmt, args...)	\
+	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
+
 static void cfq_dispatch_insert(struct request_queue *, struct request *);
 static struct cfq_queue *cfq_get_queue(struct cfq_data *, int,
 				       struct io_context *, gfp_t);
@@ -235,8 +242,10 @@ static inline int cfq_bio_sync(struct bio *bio)
  */
 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
 {
-	if (cfqd->busy_queues)
+	if (cfqd->busy_queues) {
+		cfq_log(cfqd, "schedule dispatch");
 		kblockd_schedule_work(&cfqd->unplug_work);
+	}
 }
 
 static int cfq_queue_empty(struct request_queue *q)
@@ -271,6 +280,7 @@ static inline void
 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
+	cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
 }
 
 /*
@@ -540,6 +550,7 @@ static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
  */
 static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
+	cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
 	BUG_ON(cfq_cfqq_on_rr(cfqq));
 	cfq_mark_cfqq_on_rr(cfqq);
 	cfqd->busy_queues++;
@@ -553,6 +564,7 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
  */
 static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
+	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
 	BUG_ON(!cfq_cfqq_on_rr(cfqq));
 	cfq_clear_cfqq_on_rr(cfqq);
 
@@ -639,6 +651,8 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 
 	cfqd->rq_in_driver++;
+	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
+						cfqd->rq_in_driver);
 
 	/*
 	 * If the depth is larger 1, it really could be queueing. But lets
@@ -658,6 +672,8 @@ static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
 
 	WARN_ON(!cfqd->rq_in_driver);
 	cfqd->rq_in_driver--;
+	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
+						cfqd->rq_in_driver);
 }
 
 static void cfq_remove_request(struct request *rq)
@@ -747,6 +763,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
 				   struct cfq_queue *cfqq)
 {
 	if (cfqq) {
+		cfq_log_cfqq(cfqd, cfqq, "set_active");
 		cfqq->slice_end = 0;
 		cfq_clear_cfqq_must_alloc_slice(cfqq);
 		cfq_clear_cfqq_fifo_expire(cfqq);
@@ -764,6 +781,8 @@ static void
 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		    int timed_out)
 {
+	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
+
 	if (cfq_cfqq_wait_request(cfqq))
 		del_timer(&cfqd->idle_slice_timer);
 
@@ -773,8 +792,10 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	/*
 	 * store what was left of this slice, if the queue idled/timed out
 	 */
-	if (timed_out && !cfq_cfqq_slice_new(cfqq))
+	if (timed_out && !cfq_cfqq_slice_new(cfqq)) {
 		cfqq->slice_resid = cfqq->slice_end - jiffies;
+		cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
+	}
 
 	cfq_resort_rr_list(cfqd, cfqq);
 
@@ -866,6 +887,12 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 	if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq))
 		return;
 
+	/*
+	 * still requests with the driver, don't idle
+	 */
+	if (cfqd->rq_in_driver)
+		return;
+
 	/*
 	 * task has exited, don't wait
 	 */
@@ -893,6 +920,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 		sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
 
 	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
+	cfq_log(cfqd, "arm_idle: %lu", sl);
 }
 
 /*
@@ -903,6 +931,8 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 
+	cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
+
 	cfq_remove_request(rq);
 	cfqq->dispatched++;
 	elv_dispatch_sort(q, rq);
@@ -932,8 +962,9 @@ static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
 	rq = rq_entry_fifo(cfqq->fifo.next);
 
 	if (time_before(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo]))
-		return NULL;
+		rq = NULL;
 
+	cfq_log_cfqq(cfqd, cfqq, "fifo=%p", rq);
 	return rq;
 }
 
@@ -1073,6 +1104,7 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
 
 	BUG_ON(cfqd->busy_queues);
 
+	cfq_log(cfqd, "forced_dispatch=%d", dispatched);
 	return dispatched;
 }
 
@@ -1113,6 +1145,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
 		dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
 	}
 
+	cfq_log(cfqd, "dispatched=%d", dispatched);
 	return dispatched;
 }
 
@@ -1131,6 +1164,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 	if (!atomic_dec_and_test(&cfqq->ref))
 		return;
 
+	cfq_log_cfqq(cfqd, cfqq, "put_queue");
 	BUG_ON(rb_first(&cfqq->sort_list));
 	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
 	BUG_ON(cfq_cfqq_on_rr(cfqq));
@@ -1439,6 +1473,8 @@ retry:
 				cfq_mark_cfqq_idle_window(cfqq);
 			cfq_mark_cfqq_sync(cfqq);
 		}
+		cfqq->pid = current->pid;
+		cfq_log_cfqq(cfqd, cfqq, "alloced");
 	}
 
 	if (new_cfqq)
@@ -1687,7 +1723,7 @@ static void
 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		       struct cfq_io_context *cic)
 {
-	int enable_idle;
+	int old_idle, enable_idle;
 
 	/*
 	 * Don't idle for async or idle io prio class
@@ -1695,7 +1731,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
 		return;
 
-	enable_idle = cfq_cfqq_idle_window(cfqq);
+	enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
 
 	if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
 	    (cfqd->hw_tag && CIC_SEEKY(cic)))
@@ -1707,10 +1743,13 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			enable_idle = 1;
 	}
 
-	if (enable_idle)
-		cfq_mark_cfqq_idle_window(cfqq);
-	else
-		cfq_clear_cfqq_idle_window(cfqq);
+	if (old_idle != enable_idle) {
+		cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
+		if (enable_idle)
+			cfq_mark_cfqq_idle_window(cfqq);
+		else
+			cfq_clear_cfqq_idle_window(cfqq);
+	}
 }
 
 /*
@@ -1769,6 +1808,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
  */
 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
+	cfq_log_cfqq(cfqd, cfqq, "preempt");
 	cfq_slice_expired(cfqd, 1);
 
 	/*
@@ -1830,6 +1870,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 
+	cfq_log_cfqq(cfqd, cfqq, "insert_request");
 	cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
 
 	cfq_add_rq_rb(rq);
@@ -1847,6 +1888,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	unsigned long now;
 
 	now = jiffies;
+	cfq_log_cfqq(cfqd, cfqq, "complete");
 
 	WARN_ON(!cfqd->rq_in_driver);
 	WARN_ON(!cfqq->dispatched);
@@ -2016,6 +2058,7 @@ queue_fail:
 
 	cfq_schedule_dispatch(cfqd);
 	spin_unlock_irqrestore(q->queue_lock, flags);
+	cfq_log(cfqd, "set_request fail");
 	return 1;
 }
 
@@ -2041,6 +2084,8 @@ static void cfq_idle_slice_timer(unsigned long data)
 	unsigned long flags;
 	int timed_out = 1;
 
+	cfq_log(cfqd, "idle timer fired");
+
 	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
 
 	cfqq = cfqd->active_queue;
-- 
cgit v1.2.3


From 02c62304e6af60f1963695c6bc1bbffe619aa585 Mon Sep 17 00:00:00 2001
From: "Alan D. Brunelle" <Alan.Brunelle@hp.com>
Date: Wed, 11 Jun 2008 09:12:52 +0200
Subject: Added in user-injected messages into blk traces

This allows a user to annotate the blk trace stream: writing a suitable
message to {/sys/kernel/debug}/block/<dsf>/msg will have it propagated
into the trace stream.

Signed-off-by: Alan D. Brunelle <alan.brunelle@hp.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blktrace.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

(limited to 'block')

diff --git a/block/blktrace.c b/block/blktrace.c
index 8d3a2778026..eb9651ccb24 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -244,6 +244,7 @@ err:
 static void blk_trace_cleanup(struct blk_trace *bt)
 {
 	relay_close(bt->rchan);
+	debugfs_remove(bt->msg_file);
 	debugfs_remove(bt->dropped_file);
 	blk_remove_tree(bt->dir);
 	free_percpu(bt->sequence);
@@ -291,6 +292,44 @@ static const struct file_operations blk_dropped_fops = {
 	.read =		blk_dropped_read,
 };
 
+static int blk_msg_open(struct inode *inode, struct file *filp)
+{
+	filp->private_data = inode->i_private;
+
+	return 0;
+}
+
+static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	char *msg;
+	struct blk_trace *bt;
+
+	if (count > BLK_TN_MAX_MSG)
+		return -EINVAL;
+
+	msg = kmalloc(count + 1, GFP_KERNEL);	/* room for NUL */
+	if (msg == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(msg, buffer, count)) {
+		kfree(msg);
+		return -EFAULT;
+	}
+	msg[count] = '\0';	/* "%s" below needs a terminated string */
+	bt = filp->private_data;
+	__trace_note_message(bt, "%s", msg);
+	kfree(msg);
+
+	return count;
+}
+
+static const struct file_operations blk_msg_fops = {
+	.owner =	THIS_MODULE,
+	.open =		blk_msg_open,
+	.write =	blk_msg_write,
+};
+
 /*
  * Keep track of how many times we encountered a full subbuffer, to aid
  * the user space app in telling how many lost events there were.
@@ -380,6 +419,10 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (!bt->dropped_file)
 		goto err;
 
+	bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
+	if (!bt->msg_file)
+		goto err;
+
 	bt->rchan = relay_open("trace", dir, buts->buf_size,
 				buts->buf_nr, &blk_relay_callbacks, bt);
 	if (!bt->rchan)
@@ -409,6 +452,8 @@ err:
 	if (dir)
 		blk_remove_tree(dir);
 	if (bt) {
+		if (bt->msg_file)
+			debugfs_remove(bt->msg_file);
 		if (bt->dropped_file)
 			debugfs_remove(bt->dropped_file);
 		free_percpu(bt->sequence);
-- 
cgit v1.2.3


From 1c9ce5276324ae566ca409491b99a2cc8d5986fa Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Fri, 13 Jun 2008 09:41:00 +0200
Subject: block: export "ro" attribute

Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index b922d4801c8..43e468ee599 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -400,6 +400,14 @@ static ssize_t disk_removable_show(struct device *dev,
 		       (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
 }
 
+static ssize_t disk_ro_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+
+	return sprintf(buf, "%d\n", disk->policy ? 1 : 0);
+}
+
 static ssize_t disk_size_show(struct device *dev,
 			      struct device_attribute *attr, char *buf)
 {
@@ -472,6 +480,7 @@ static ssize_t disk_fail_store(struct device *dev,
 
 static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
 static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
+static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, disk_size_show, NULL);
 static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, disk_stat_show, NULL);
@@ -483,6 +492,7 @@ static struct device_attribute dev_attr_fail =
 static struct attribute *disk_attrs[] = {
 	&dev_attr_range.attr,
 	&dev_attr_removable.attr,
+	&dev_attr_ro.attr,
 	&dev_attr_size.attr,
 	&dev_attr_capability.attr,
 	&dev_attr_stat.attr,
-- 
cgit v1.2.3


From 7ba1ba12eeef0aa7113beb16410ef8b7c748e18b Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Mon, 30 Jun 2008 20:04:41 +0200
Subject: block: Block layer data integrity support

Some block devices support verifying the integrity of requests by way
of checksums or other protection information that is submitted along
with the I/O.

This patch implements support for generating and verifying integrity
metadata, as well as correctly merging, splitting and cloning bios and
requests that have this extra information attached.

See Documentation/block/data-integrity.txt for more information.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/Kconfig         |  12 ++
 block/Makefile        |   1 +
 block/blk-core.c      |   7 +
 block/blk-integrity.c | 382 ++++++++++++++++++++++++++++++++++++++++++++++++++
 block/blk-merge.c     |   3 +
 block/blk.h           |   8 ++
 block/elevator.c      |   6 +
 7 files changed, 419 insertions(+)
 create mode 100644 block/blk-integrity.c

(limited to 'block')

diff --git a/block/Kconfig b/block/Kconfig
index 3e97f2bc446..1ab7c15c8d7 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -81,6 +81,18 @@ config BLK_DEV_BSG
 
 	  If unsure, say N.
 
+config BLK_DEV_INTEGRITY
+	bool "Block layer data integrity support"
+	---help---
+	Some storage devices allow extra information to be
+	stored/retrieved to help protect the data.  The block layer
+	data integrity option provides hooks which can be used by
+	filesystems to ensure better data integrity.
+
+	Say yes here if you have a storage device that provides the
+	T10/SCSI Data Integrity Field or the T13/ATA External Path
+	Protection.  If in doubt, say N.
+
 endif # BLOCK
 
 config BLOCK_COMPAT
diff --git a/block/Makefile b/block/Makefile
index 5a43c7d7959..045f7b62e4b 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -14,3 +14,4 @@ obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
 
 obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
+obj-$(CONFIG_BLK_DEV_INTEGRITY)	+= blk-integrity.o
diff --git a/block/blk-core.c b/block/blk-core.c
index 1905aaba49f..e0fb0bcc0c1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -143,6 +143,10 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 
 		bio->bi_size -= nbytes;
 		bio->bi_sector += (nbytes >> 9);
+
+		if (bio_integrity(bio))
+			bio_integrity_advance(bio, nbytes);
+
 		if (bio->bi_size == 0)
 			bio_endio(bio, error);
 	} else {
@@ -1381,6 +1385,9 @@ end_io:
 		 */
 		blk_partition_remap(bio);
 
+		if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
+			goto end_io;
+
 		if (old_sector != -1)
 			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
 					    old_sector);
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
new file mode 100644
index 00000000000..65f23ef38bb
--- /dev/null
+++ b/block/blk-integrity.c
@@ -0,0 +1,382 @@
+/*
+ * blk-integrity.c - Block layer data integrity extensions
+ *
+ * Copyright (C) 2007, 2008 Oracle Corporation
+ * Written by: Martin K. Petersen <martin.petersen@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ *
+ */
+
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/bio.h>
+#include <linux/scatterlist.h>
+
+#include "blk.h"
+
+static struct kmem_cache *integrity_cachep;
+
+/**
+ * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements
+ * @rq:		request with integrity metadata attached
+ *
+ * Description: Returns the number of elements required in a
+ * scatterlist corresponding to the integrity metadata in a request.
+ */
+int blk_rq_count_integrity_sg(struct request *rq)
+{
+	struct bio_vec *iv, *ivprv;
+	struct req_iterator iter;
+	unsigned int segments;
+
+	ivprv = NULL;
+	segments = 0;
+
+	rq_for_each_integrity_segment(iv, rq, iter) {
+
+		if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv))
+			segments++;
+
+		ivprv = iv;
+	}
+
+	return segments;
+}
+EXPORT_SYMBOL(blk_rq_count_integrity_sg);
+
+/**
+ * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
+ * @rq:		request with integrity metadata attached
+ * @sglist:	target scatterlist
+ *
+ * Description: Map the integrity vectors in request into a
+ * scatterlist.  The scatterlist must be big enough to hold all
+ * elements.  I.e. sized using blk_rq_count_integrity_sg().
+ */
+int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist)
+{
+	struct bio_vec *iv, *ivprv;
+	struct req_iterator iter;
+	struct scatterlist *sg;
+	unsigned int segments;
+
+	ivprv = NULL;
+	sg = NULL;
+	segments = 0;
+
+	rq_for_each_integrity_segment(iv, rq, iter) {
+
+		if (ivprv) {
+			if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv))
+				goto new_segment;
+
+			sg->length += iv->bv_len;
+		} else {
+new_segment:
+			if (!sg)
+				sg = sglist;
+			else {
+				sg->page_link &= ~0x02;
+				sg = sg_next(sg);
+			}
+
+			sg_set_page(sg, iv->bv_page, iv->bv_len, iv->bv_offset);
+			segments++;
+		}
+
+		ivprv = iv;
+	}
+
+	if (sg)
+		sg_mark_end(sg);
+
+	return segments;
+}
+EXPORT_SYMBOL(blk_rq_map_integrity_sg);
+
+/**
+ * blk_integrity_compare - Compare integrity profile of two block devices
+ * @bd1:	Device to compare
+ * @bd2:	Device to compare
+ *
+ * Description: Meta-devices like DM and MD need to verify that all
+ * sub-devices use the same integrity format before advertising to
+ * upper layers that they can send/receive integrity metadata.  This
+ * function can be used to check whether two block devices have
+ * compatible integrity formats.
+ */
+int blk_integrity_compare(struct block_device *bd1, struct block_device *bd2)
+{
+	struct blk_integrity *b1 = bd1->bd_disk->integrity;
+	struct blk_integrity *b2 = bd2->bd_disk->integrity;
+
+	BUG_ON(bd1->bd_disk == NULL);
+	BUG_ON(bd2->bd_disk == NULL);
+
+	if (!b1 || !b2)
+		return 0;
+
+	if (b1->sector_size != b2->sector_size) {
+		printk(KERN_ERR "%s: %s/%s sector sz %u != %u\n", __func__,
+		       bd1->bd_disk->disk_name, bd2->bd_disk->disk_name,
+		       b1->sector_size, b2->sector_size);
+		return -1;
+	}
+
+	if (b1->tuple_size != b2->tuple_size) {
+		printk(KERN_ERR "%s: %s/%s tuple sz %u != %u\n", __func__,
+		       bd1->bd_disk->disk_name, bd2->bd_disk->disk_name,
+		       b1->tuple_size, b2->tuple_size);
+		return -1;
+	}
+
+	if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) {
+		printk(KERN_ERR "%s: %s/%s tag sz %u != %u\n", __func__,
+		       bd1->bd_disk->disk_name, bd2->bd_disk->disk_name,
+		       b1->tag_size, b2->tag_size);
+		return -1;
+	}
+
+	if (strcmp(b1->name, b2->name)) {
+		printk(KERN_ERR "%s: %s/%s type %s != %s\n", __func__,
+		       bd1->bd_disk->disk_name, bd2->bd_disk->disk_name,
+		       b1->name, b2->name);
+		return -1;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(blk_integrity_compare);
+
+struct integrity_sysfs_entry {
+	struct attribute attr;
+	ssize_t (*show)(struct blk_integrity *, char *);
+	ssize_t (*store)(struct blk_integrity *, const char *, size_t);
+};
+
+static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr,
+				   char *page)
+{
+	struct blk_integrity *bi =
+		container_of(kobj, struct blk_integrity, kobj);
+	struct integrity_sysfs_entry *entry =
+		container_of(attr, struct integrity_sysfs_entry, attr);
+
+	return entry->show(bi, page);
+}
+
+static ssize_t integrity_attr_store(struct kobject *kobj, struct attribute *attr,
+				    const char *page, size_t count)
+{
+	struct blk_integrity *bi =
+		container_of(kobj, struct blk_integrity, kobj);
+	struct integrity_sysfs_entry *entry =
+		container_of(attr, struct integrity_sysfs_entry, attr);
+	ssize_t ret = 0;
+
+	if (entry->store)
+		ret = entry->store(bi, page, count);
+
+	return ret;
+}
+
+static ssize_t integrity_format_show(struct blk_integrity *bi, char *page)
+{
+	if (bi != NULL && bi->name != NULL)
+		return sprintf(page, "%s\n", bi->name);
+	else
+		return sprintf(page, "none\n");
+}
+
+static ssize_t integrity_tag_size_show(struct blk_integrity *bi, char *page)
+{
+	if (bi != NULL)
+		return sprintf(page, "%u\n", bi->tag_size);
+	else
+		return sprintf(page, "0\n");
+}
+
+static ssize_t integrity_read_store(struct blk_integrity *bi,
+				    const char *page, size_t count)
+{
+	char *p = (char *) page;
+	unsigned long val = simple_strtoul(p, &p, 10);
+
+	if (val)
+		set_bit(INTEGRITY_FLAG_READ, &bi->flags);
+	else
+		clear_bit(INTEGRITY_FLAG_READ, &bi->flags);
+
+	return count;
+}
+
+static ssize_t integrity_read_show(struct blk_integrity *bi, char *page)
+{
+	return sprintf(page, "%d\n",
+		       test_bit(INTEGRITY_FLAG_READ, &bi->flags) ? 1 : 0);
+}
+
+static ssize_t integrity_write_store(struct blk_integrity *bi,
+				     const char *page, size_t count)
+{
+	char *p = (char *) page;
+	unsigned long val = simple_strtoul(p, &p, 10);
+
+	if (val)
+		set_bit(INTEGRITY_FLAG_WRITE, &bi->flags);
+	else
+		clear_bit(INTEGRITY_FLAG_WRITE, &bi->flags);
+
+	return count;
+}
+
+static ssize_t integrity_write_show(struct blk_integrity *bi, char *page)
+{
+	return sprintf(page, "%d\n",
+		       test_bit(INTEGRITY_FLAG_WRITE, &bi->flags) ? 1 : 0);
+}
+
+static struct integrity_sysfs_entry integrity_format_entry = {
+	.attr = { .name = "format", .mode = S_IRUGO },
+	.show = integrity_format_show,
+};
+
+static struct integrity_sysfs_entry integrity_tag_size_entry = {
+	.attr = { .name = "tag_size", .mode = S_IRUGO },
+	.show = integrity_tag_size_show,
+};
+
+static struct integrity_sysfs_entry integrity_read_entry = {
+	.attr = { .name = "read_verify", .mode = S_IRUGO | S_IWUSR },
+	.show = integrity_read_show,
+	.store = integrity_read_store,
+};
+
+static struct integrity_sysfs_entry integrity_write_entry = {
+	.attr = { .name = "write_generate", .mode = S_IRUGO | S_IWUSR },
+	.show = integrity_write_show,
+	.store = integrity_write_store,
+};
+
+static struct attribute *integrity_attrs[] = {
+	&integrity_format_entry.attr,
+	&integrity_tag_size_entry.attr,
+	&integrity_read_entry.attr,
+	&integrity_write_entry.attr,
+	NULL,
+};
+
+static struct sysfs_ops integrity_ops = {
+	.show	= &integrity_attr_show,
+	.store	= &integrity_attr_store,
+};
+
+static int __init blk_dev_integrity_init(void)
+{
+	integrity_cachep = kmem_cache_create("blkdev_integrity",
+					     sizeof(struct blk_integrity),
+					     0, SLAB_PANIC, NULL);
+	return 0;
+}
+subsys_initcall(blk_dev_integrity_init);
+
+static void blk_integrity_release(struct kobject *kobj)
+{
+	struct blk_integrity *bi =
+		container_of(kobj, struct blk_integrity, kobj);
+
+	kmem_cache_free(integrity_cachep, bi);
+}
+
+static struct kobj_type integrity_ktype = {
+	.default_attrs	= integrity_attrs,
+	.sysfs_ops	= &integrity_ops,
+	.release	= blk_integrity_release,
+};
+
+/**
+ * blk_integrity_register - Register a gendisk as being integrity-capable
+ * @disk:	struct gendisk pointer to make integrity-aware
+ * @template:	integrity profile
+ *
+ * Description: When a device needs to advertise itself as being able
+ * to send/receive integrity metadata it must use this function to
+ * register the capability with the block layer.  The template is a
+ * blk_integrity struct with values appropriate for the underlying
+ * hardware.  See Documentation/block/data-integrity.txt.
+ */
+int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
+{
+	struct blk_integrity *bi;
+
+	BUG_ON(disk == NULL);
+	BUG_ON(template == NULL);
+
+	if (disk->integrity == NULL) {
+		bi = kmem_cache_alloc(integrity_cachep, GFP_KERNEL | __GFP_ZERO);
+		if (!bi)
+			return -1;
+
+		if (kobject_init_and_add(&bi->kobj, &integrity_ktype,
+					 &disk->dev.kobj, "%s", "integrity")) {
+			kmem_cache_free(integrity_cachep, bi);
+			return -1;
+		}
+
+		kobject_uevent(&bi->kobj, KOBJ_ADD);
+
+		set_bit(INTEGRITY_FLAG_READ, &bi->flags);
+		set_bit(INTEGRITY_FLAG_WRITE, &bi->flags);
+		bi->sector_size = disk->queue->hardsect_size;
+		disk->integrity = bi;
+	} else
+		bi = disk->integrity;
+
+	/* Use the provided profile as template */
+	bi->name = template->name;
+	bi->generate_fn = template->generate_fn;
+	bi->verify_fn = template->verify_fn;
+	bi->tuple_size = template->tuple_size;
+	bi->set_tag_fn = template->set_tag_fn;
+	bi->get_tag_fn = template->get_tag_fn;
+	bi->tag_size = template->tag_size;
+
+	return 0;
+}
+EXPORT_SYMBOL(blk_integrity_register);
+
+/**
+ * blk_integrity_unregister - Remove block integrity profile
+ * @disk:	disk whose integrity profile to deallocate
+ *
+ * Description: This function frees all memory used by the block
+ * integrity profile.  To be called at device teardown.
+ */
+void blk_integrity_unregister(struct gendisk *disk)
+{
+	struct blk_integrity *bi;
+
+	if (!disk || !disk->integrity)
+		return;
+
+	bi = disk->integrity;
+
+	kobject_uevent(&bi->kobj, KOBJ_REMOVE);
+	kobject_del(&bi->kobj);
+	kobject_put(&disk->dev.kobj);
+	kmem_cache_free(integrity_cachep, bi);
+}
+EXPORT_SYMBOL(blk_integrity_unregister);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 651136aae76..5efc9e7a68b 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -441,6 +441,9 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 	    || next->special)
 		return 0;
 
+	if (blk_integrity_rq(req) != blk_integrity_rq(next))
+		return 0;
+
 	/*
 	 * If we are allowed to merge, then append bio list
 	 * from next to rq and release next. merge_requests_fn
diff --git a/block/blk.h b/block/blk.h
index 59776ab4742..c79f30e1df5 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -51,4 +51,12 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
 	return q->nr_congestion_off;
 }
 
+#if defined(CONFIG_BLK_DEV_INTEGRITY)
+
+#define rq_for_each_integrity_segment(bvl, _rq, _iter)		\
+	__rq_for_each_bio(_iter.bio, _rq)			\
+		bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i)
+
+#endif /* BLK_DEV_INTEGRITY */
+
 #endif
diff --git a/block/elevator.c b/block/elevator.c
index 902dd1344d5..1f5bfe69602 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -86,6 +86,12 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
 	if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special)
 		return 0;
 
+	/*
+	 * only merge integrity protected bio into ditto rq
+	 */
+	if (bio_integrity(bio) != blk_integrity_rq(rq))
+		return 0;
+
 	if (!elv_iosched_allow_merge(rq, bio))
 		return 0;
 
-- 
cgit v1.2.3


From b984679efe1a616ec4ac919dba08286d71593900 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 17 Jun 2008 19:05:48 +0200
Subject: block: integrity checkpatch cleanups

> 80 char lines and that sort of thing.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-integrity.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 65f23ef38bb..4ffa3814f6a 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -178,8 +178,9 @@ static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr,
 	return entry->show(bi, page);
 }
 
-static ssize_t integrity_attr_store(struct kobject *kobj, struct attribute *attr,
-				    const char *page, size_t count)
+static ssize_t integrity_attr_store(struct kobject *kobj,
+				    struct attribute *attr, const char *page,
+				    size_t count)
 {
 	struct blk_integrity *bi =
 		container_of(kobj, struct blk_integrity, kobj);
@@ -326,7 +327,8 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
 	BUG_ON(template == NULL);
 
 	if (disk->integrity == NULL) {
-		bi = kmem_cache_alloc(integrity_cachep, GFP_KERNEL | __GFP_ZERO);
+		bi = kmem_cache_alloc(integrity_cachep,
+						GFP_KERNEL | __GFP_ZERO);
 		if (!bi)
 			return -1;
 
-- 
cgit v1.2.3


From 0b07de85a76e1346e675f0e98437378932473df7 Mon Sep 17 00:00:00 2001
From: Adel Gadllah <adel.gadllah@gmail.com>
Date: Thu, 26 Jun 2008 13:48:27 +0200
Subject: allow userspace to modify scsi command filter on per device basis

This patch exports the per-gendisk command filter to user space through
sysfs, so it can be changed by the system administrator.
All users of the old cmd filter have been converted to use the new one.

Original patch from Peter Jones.

Signed-off-by: Adel Gadllah <adel.gadllah@gmail.com>
Signed-off-by: Peter Jones <pjones@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/Makefile     |   3 +-
 block/bsg.c        |  38 +++++--
 block/cmd-filter.c | 325 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 block/genhd.c      |   2 +
 block/scsi_ioctl.c | 121 +-------------------
 5 files changed, 361 insertions(+), 128 deletions(-)
 create mode 100644 block/cmd-filter.c

(limited to 'block')

diff --git a/block/Makefile b/block/Makefile
index 045f7b62e4b..208000b0750 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -4,7 +4,8 @@
 
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
-			blk-exec.o blk-merge.o ioctl.o genhd.o scsi_ioctl.o
+			blk-exec.o blk-merge.o ioctl.o genhd.o scsi_ioctl.o \
+			cmd-filter.o
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
diff --git a/block/bsg.c b/block/bsg.c
index f0b7cd34321..439940c3a1f 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -44,11 +44,12 @@ struct bsg_device {
 	char name[BUS_ID_SIZE];
 	int max_queue;
 	unsigned long flags;
+	struct blk_scsi_cmd_filter *cmd_filter;
+	mode_t *f_mode;
 };
 
 enum {
 	BSG_F_BLOCK		= 1,
-	BSG_F_WRITE_PERM	= 2,
 };
 
 #define BSG_DEFAULT_CMDS	64
@@ -172,7 +173,7 @@ unlock:
 }
 
 static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
-				struct sg_io_v4 *hdr, int has_write_perm)
+				struct sg_io_v4 *hdr, struct bsg_device *bd)
 {
 	if (hdr->request_len > BLK_MAX_CDB) {
 		rq->cmd = kzalloc(hdr->request_len, GFP_KERNEL);
@@ -185,7 +186,8 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
 		return -EFAULT;
 
 	if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) {
-		if (blk_verify_command(rq->cmd, has_write_perm))
+		if (blk_cmd_filter_verify_command(bd->cmd_filter, rq->cmd,
+						 bd->f_mode))
 			return -EPERM;
 	} else if (!capable(CAP_SYS_RAWIO))
 		return -EPERM;
@@ -263,8 +265,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr)
 	rq = blk_get_request(q, rw, GFP_KERNEL);
 	if (!rq)
 		return ERR_PTR(-ENOMEM);
-	ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, test_bit(BSG_F_WRITE_PERM,
-						       &bd->flags));
+	ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd);
 	if (ret)
 		goto out;
 
@@ -566,12 +567,23 @@ static inline void bsg_set_block(struct bsg_device *bd, struct file *file)
 		set_bit(BSG_F_BLOCK, &bd->flags);
 }
 
-static inline void bsg_set_write_perm(struct bsg_device *bd, struct file *file)
+static void bsg_set_cmd_filter(struct bsg_device *bd,
+			   struct file *file)
 {
-	if (file->f_mode & FMODE_WRITE)
-		set_bit(BSG_F_WRITE_PERM, &bd->flags);
-	else
-		clear_bit(BSG_F_WRITE_PERM, &bd->flags);
+	struct inode *inode;
+	struct gendisk *disk;
+
+	if (!file)
+		return;
+
+	inode = file->f_dentry->d_inode;
+	if (!inode)
+		return;
+
+	disk = inode->i_bdev->bd_disk;
+
+	bd->cmd_filter = &disk->cmd_filter;
+	bd->f_mode = &file->f_mode;
 }
 
 /*
@@ -595,6 +607,8 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	dprintk("%s: read %Zd bytes\n", bd->name, count);
 
 	bsg_set_block(bd, file);
+	bsg_set_cmd_filter(bd, file);
+
 	bytes_read = 0;
 	ret = __bsg_read(buf, count, bd, NULL, &bytes_read);
 	*ppos = bytes_read;
@@ -668,7 +682,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
 	dprintk("%s: write %Zd bytes\n", bd->name, count);
 
 	bsg_set_block(bd, file);
-	bsg_set_write_perm(bd, file);
+	bsg_set_cmd_filter(bd, file);
 
 	bytes_written = 0;
 	ret = __bsg_write(bd, buf, count, &bytes_written);
@@ -771,7 +785,9 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
 	}
 
 	bd->queue = rq;
+
 	bsg_set_block(bd, file);
+	bsg_set_cmd_filter(bd, file);
 
 	atomic_set(&bd->ref_count, 1);
 	mutex_lock(&bsg_mutex);
diff --git a/block/cmd-filter.c b/block/cmd-filter.c
new file mode 100644
index 00000000000..35e327ceaa9
--- /dev/null
+++ b/block/cmd-filter.c
@@ -0,0 +1,325 @@
+/*
+ * Copyright 2004 Peter M. Jones <pjones@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <linux/list.h>
+#include <linux/genhd.h>
+#include <linux/spinlock.h>
+#include <linux/parser.h>
+#include <linux/capability.h>
+#include <linux/bitops.h>
+
+#include <scsi/scsi.h>
+#include <linux/cdrom.h>
+
+int blk_cmd_filter_verify_command(struct blk_scsi_cmd_filter *filter,
+				  unsigned char *cmd, mode_t *f_mode)
+{
+	/* root can do any command. */
+	if (capable(CAP_SYS_RAWIO))
+		return 0;
+
+	/* if there's no filter set, assume we're filtering everything out */
+	if (!filter)
+		return -EPERM;
+
+	/* Anybody who can open the device can do a read-safe command */
+	if (test_bit(cmd[0], filter->read_ok))
+		return 0;
+
+	/* Write-safe commands require a writable open */
+	if (test_bit(cmd[0], filter->write_ok) && (*f_mode & FMODE_WRITE))
+		return 0;
+
+	return -EPERM;
+}
+EXPORT_SYMBOL(blk_cmd_filter_verify_command);
+
+int blk_verify_command(struct file *file, unsigned char *cmd)
+{
+	struct gendisk *disk;
+	struct inode *inode;
+
+	if (!file)
+		return -EINVAL;
+
+	inode = file->f_dentry->d_inode;
+	if (!inode)
+		return -EINVAL;
+
+	disk = inode->i_bdev->bd_disk;
+
+	return blk_cmd_filter_verify_command(&disk->cmd_filter,
+						 cmd, &file->f_mode);
+}
+EXPORT_SYMBOL(blk_verify_command);
+
+/* and now, the sysfs stuff */
+static ssize_t rcf_cmds_show(struct blk_scsi_cmd_filter *filter, char *page,
+			     int rw)
+{
+	char *npage = page;
+	unsigned long *okbits;
+	int i;
+
+	if (rw == READ)
+		okbits = filter->read_ok;
+	else
+		okbits = filter->write_ok;
+
+	for (i = 0; i < BLK_SCSI_MAX_CMDS; i++) {
+		if (test_bit(i, okbits)) {
+			sprintf(npage, "%02x", i);
+			npage += 2;
+			if (i < BLK_SCSI_MAX_CMDS - 1)
+				sprintf(npage++, " ");
+		}
+	}
+
+	if (npage != page)
+		npage += sprintf(npage, "\n");
+
+	return npage - page;
+}
+
+static ssize_t rcf_readcmds_show(struct blk_scsi_cmd_filter *filter, char *page)
+{
+	return rcf_cmds_show(filter, page, READ);
+}
+
+static ssize_t rcf_writecmds_show(struct blk_scsi_cmd_filter *filter,
+				 char *page)
+{
+	return rcf_cmds_show(filter, page, WRITE);
+}
+
+static ssize_t rcf_cmds_store(struct blk_scsi_cmd_filter *filter,
+			      const char *page, size_t count, int rw)
+{
+	ssize_t ret = 0;
+	unsigned long okbits[BLK_SCSI_CMD_PER_LONG], *target_okbits;
+	int cmd, status, len;
+	substring_t ss;
+
+	memset(&okbits, 0, sizeof(okbits));
+
+	for (len = strlen(page); len > 0; len -= 3) {
+		if (len < 2)
+			break;
+		ss.from = (char *) page + ret;
+		ss.to = (char *) page + ret + 2;
+		ret += 3;
+		status = match_hex(&ss, &cmd);
+		/* either case means invalid input: reject, filter unchanged */
+		if (status || cmd >= BLK_SCSI_MAX_CMDS)
+			return -EINVAL;
+
+		__set_bit(cmd, okbits);
+	}
+
+	if (rw == READ)
+		target_okbits = filter->read_ok;
+	else
+		target_okbits = filter->write_ok;
+
+	memmove(target_okbits, okbits, sizeof(okbits));
+	return count;
+}
+
+static ssize_t rcf_readcmds_store(struct blk_scsi_cmd_filter *filter,
+				  const char *page, size_t count)
+{
+	return rcf_cmds_store(filter, page, count, READ);
+}
+
+static ssize_t rcf_writecmds_store(struct blk_scsi_cmd_filter *filter,
+				   const char *page, size_t count)
+{
+	return rcf_cmds_store(filter, page, count, WRITE);
+}
+
+struct rcf_sysfs_entry {
+	struct attribute attr;
+	ssize_t (*show)(struct blk_scsi_cmd_filter *, char *);
+	ssize_t (*store)(struct blk_scsi_cmd_filter *, const char *, size_t);
+};
+
+static struct rcf_sysfs_entry rcf_readcmds_entry = {
+	.attr = { .name = "read_table", .mode = S_IRUGO | S_IWUSR },
+	.show = rcf_readcmds_show,
+	.store = rcf_readcmds_store,
+};
+
+static struct rcf_sysfs_entry rcf_writecmds_entry = {
+	.attr = {.name = "write_table", .mode = S_IRUGO | S_IWUSR },
+	.show = rcf_writecmds_show,
+	.store = rcf_writecmds_store,
+};
+
+static struct attribute *default_attrs[] = {
+	&rcf_readcmds_entry.attr,
+	&rcf_writecmds_entry.attr,
+	NULL,
+};
+
+#define to_rcf(atr) container_of((atr), struct rcf_sysfs_entry, attr)
+
+static ssize_t
+rcf_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+	struct rcf_sysfs_entry *entry = to_rcf(attr);
+	struct blk_scsi_cmd_filter *filter;
+
+	filter = container_of(kobj, struct blk_scsi_cmd_filter, kobj);
+	if (entry->show)
+		return entry->show(filter, page);
+
+	return 0;
+}
+
+static ssize_t
+rcf_attr_store(struct kobject *kobj, struct attribute *attr,
+			const char *page, size_t length)
+{
+	struct rcf_sysfs_entry *entry = to_rcf(attr);
+	struct blk_scsi_cmd_filter *filter;
+
+	if (!capable(CAP_SYS_RAWIO))
+		return -EPERM;
+
+	if (!entry->store)
+		return -EINVAL;
+
+	filter = container_of(kobj, struct blk_scsi_cmd_filter, kobj);
+	return entry->store(filter, page, length);
+}
+
+static struct sysfs_ops rcf_sysfs_ops = {
+	.show = rcf_attr_show,
+	.store = rcf_attr_store,
+};
+
+static struct kobj_type rcf_ktype = {
+	.sysfs_ops = &rcf_sysfs_ops,
+	.default_attrs = default_attrs,
+};
+
+static void rcf_set_defaults(struct blk_scsi_cmd_filter *filter)
+{
+	/* Basic read-only commands */
+	__set_bit(TEST_UNIT_READY, filter->read_ok);
+	__set_bit(REQUEST_SENSE, filter->read_ok);
+	__set_bit(READ_6, filter->read_ok);
+	__set_bit(READ_10, filter->read_ok);
+	__set_bit(READ_12, filter->read_ok);
+	__set_bit(READ_16, filter->read_ok);
+	__set_bit(READ_BUFFER, filter->read_ok);
+	__set_bit(READ_DEFECT_DATA, filter->read_ok);
+	__set_bit(READ_LONG, filter->read_ok);
+	__set_bit(INQUIRY, filter->read_ok);
+	__set_bit(MODE_SENSE, filter->read_ok);
+	__set_bit(MODE_SENSE_10, filter->read_ok);
+	__set_bit(LOG_SENSE, filter->read_ok);
+	__set_bit(START_STOP, filter->read_ok);
+	__set_bit(GPCMD_VERIFY_10, filter->read_ok);
+	__set_bit(VERIFY_16, filter->read_ok);
+	__set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok);
+
+	/* Audio CD commands */
+	__set_bit(GPCMD_PLAY_CD, filter->read_ok);
+	__set_bit(GPCMD_PLAY_AUDIO_10, filter->read_ok);
+	__set_bit(GPCMD_PLAY_AUDIO_MSF, filter->read_ok);
+	__set_bit(GPCMD_PLAY_AUDIO_TI, filter->read_ok);
+	__set_bit(GPCMD_PAUSE_RESUME, filter->read_ok);
+
+	/* CD/DVD data reading */
+	__set_bit(GPCMD_READ_CD, filter->read_ok);
+	__set_bit(GPCMD_READ_CD_MSF, filter->read_ok);
+	__set_bit(GPCMD_READ_DISC_INFO, filter->read_ok);
+	__set_bit(GPCMD_READ_CDVD_CAPACITY, filter->read_ok);
+	__set_bit(GPCMD_READ_DVD_STRUCTURE, filter->read_ok);
+	__set_bit(GPCMD_READ_HEADER, filter->read_ok);
+	__set_bit(GPCMD_READ_TRACK_RZONE_INFO, filter->read_ok);
+	__set_bit(GPCMD_READ_SUBCHANNEL, filter->read_ok);
+	__set_bit(GPCMD_READ_TOC_PMA_ATIP, filter->read_ok);
+	__set_bit(GPCMD_REPORT_KEY, filter->read_ok);
+	__set_bit(GPCMD_SCAN, filter->read_ok);
+	__set_bit(GPCMD_GET_CONFIGURATION, filter->read_ok);
+	__set_bit(GPCMD_READ_FORMAT_CAPACITIES, filter->read_ok);
+	__set_bit(GPCMD_GET_EVENT_STATUS_NOTIFICATION, filter->read_ok);
+	__set_bit(GPCMD_GET_PERFORMANCE, filter->read_ok);
+	__set_bit(GPCMD_SEEK, filter->read_ok);
+	__set_bit(GPCMD_STOP_PLAY_SCAN, filter->read_ok);
+
+	/* Basic writing commands */
+	__set_bit(WRITE_6, filter->write_ok);
+	__set_bit(WRITE_10, filter->write_ok);
+	__set_bit(WRITE_VERIFY, filter->write_ok);
+	__set_bit(WRITE_12, filter->write_ok);
+	__set_bit(WRITE_VERIFY_12, filter->write_ok);
+	__set_bit(WRITE_16, filter->write_ok);
+	__set_bit(WRITE_LONG, filter->write_ok);
+	__set_bit(WRITE_LONG_2, filter->write_ok);
+	__set_bit(ERASE, filter->write_ok);
+	__set_bit(GPCMD_MODE_SELECT_10, filter->write_ok);
+	__set_bit(MODE_SELECT, filter->write_ok);
+	__set_bit(LOG_SELECT, filter->write_ok);
+	__set_bit(GPCMD_BLANK, filter->write_ok);
+	__set_bit(GPCMD_CLOSE_TRACK, filter->write_ok);
+	__set_bit(GPCMD_FLUSH_CACHE, filter->write_ok);
+	__set_bit(GPCMD_FORMAT_UNIT, filter->write_ok);
+	__set_bit(GPCMD_REPAIR_RZONE_TRACK, filter->write_ok);
+	__set_bit(GPCMD_RESERVE_RZONE_TRACK, filter->write_ok);
+	__set_bit(GPCMD_SEND_DVD_STRUCTURE, filter->write_ok);
+	__set_bit(GPCMD_SEND_EVENT, filter->write_ok);
+	__set_bit(GPCMD_SEND_KEY, filter->write_ok);
+	__set_bit(GPCMD_SEND_OPC, filter->write_ok);
+	__set_bit(GPCMD_SEND_CUE_SHEET, filter->write_ok);
+	__set_bit(GPCMD_SET_SPEED, filter->write_ok);
+	__set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok);
+	__set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok);
+	__set_bit(GPCMD_SET_STREAMING, filter->write_ok);
+}
+
+int blk_register_filter(struct gendisk *disk)
+{
+	int ret;
+	struct blk_scsi_cmd_filter *filter = &disk->cmd_filter;
+	struct kobject *parent = kobject_get(disk->holder_dir->parent);
+
+	if (!parent)
+		return -ENODEV;
+
+	ret = kobject_init_and_add(&filter->kobj, &rcf_ktype, parent,
+				 "%s", "cmd_filter");
+
+	if (ret < 0)
+		return ret;
+
+	rcf_set_defaults(filter);
+	return 0;
+}
+
+void blk_unregister_filter(struct gendisk *disk)
+{
+	struct blk_scsi_cmd_filter *filter = &disk->cmd_filter;
+
+	kobject_put(&filter->kobj);
+	kobject_put(disk->holder_dir->parent);
+}
+
diff --git a/block/genhd.c b/block/genhd.c
index 43e468ee599..9074f384b09 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -189,6 +189,7 @@ void add_disk(struct gendisk *disk)
 			    disk->minors, NULL, exact_match, exact_lock, disk);
 	register_disk(disk);
 	blk_register_queue(disk);
+	blk_register_filter(disk);
 
 	bdi = &disk->queue->backing_dev_info;
 	bdi_register_dev(bdi, MKDEV(disk->major, disk->first_minor));
@@ -200,6 +201,7 @@ EXPORT_SYMBOL(del_gendisk);	/* in partitions/check.c */
 
 void unlink_gendisk(struct gendisk *disk)
 {
+	blk_unregister_filter(disk);
 	sysfs_remove_link(&disk->dev.kobj, "bdi");
 	bdi_unregister(&disk->queue->backing_dev_info);
 	blk_unregister_queue(disk);
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 78199c08ec9..c5b9bcfc0a6 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -105,120 +105,12 @@ static int sg_emulated_host(struct request_queue *q, int __user *p)
 	return put_user(1, p);
 }
 
-#define CMD_READ_SAFE	0x01
-#define CMD_WRITE_SAFE	0x02
-#define CMD_WARNED	0x04
-#define safe_for_read(cmd)	[cmd] = CMD_READ_SAFE
-#define safe_for_write(cmd)	[cmd] = CMD_WRITE_SAFE
-
-int blk_verify_command(unsigned char *cmd, int has_write_perm)
-{
-	static unsigned char cmd_type[256] = {
-
-		/* Basic read-only commands */
-		safe_for_read(TEST_UNIT_READY),
-		safe_for_read(REQUEST_SENSE),
-		safe_for_read(READ_6),
-		safe_for_read(READ_10),
-		safe_for_read(READ_12),
-		safe_for_read(READ_16),
-		safe_for_read(READ_BUFFER),
-		safe_for_read(READ_DEFECT_DATA),
-		safe_for_read(READ_LONG),
-		safe_for_read(INQUIRY),
-		safe_for_read(MODE_SENSE),
-		safe_for_read(MODE_SENSE_10),
-		safe_for_read(LOG_SENSE),
-		safe_for_read(START_STOP),
-		safe_for_read(GPCMD_VERIFY_10),
-		safe_for_read(VERIFY_16),
-
-		/* Audio CD commands */
-		safe_for_read(GPCMD_PLAY_CD),
-		safe_for_read(GPCMD_PLAY_AUDIO_10),
-		safe_for_read(GPCMD_PLAY_AUDIO_MSF),
-		safe_for_read(GPCMD_PLAY_AUDIO_TI),
-		safe_for_read(GPCMD_PAUSE_RESUME),
-
-		/* CD/DVD data reading */
-		safe_for_read(GPCMD_READ_BUFFER_CAPACITY),
-		safe_for_read(GPCMD_READ_CD),
-		safe_for_read(GPCMD_READ_CD_MSF),
-		safe_for_read(GPCMD_READ_DISC_INFO),
-		safe_for_read(GPCMD_READ_CDVD_CAPACITY),
-		safe_for_read(GPCMD_READ_DVD_STRUCTURE),
-		safe_for_read(GPCMD_READ_HEADER),
-		safe_for_read(GPCMD_READ_TRACK_RZONE_INFO),
-		safe_for_read(GPCMD_READ_SUBCHANNEL),
-		safe_for_read(GPCMD_READ_TOC_PMA_ATIP),
-		safe_for_read(GPCMD_REPORT_KEY),
-		safe_for_read(GPCMD_SCAN),
-		safe_for_read(GPCMD_GET_CONFIGURATION),
-		safe_for_read(GPCMD_READ_FORMAT_CAPACITIES),
-		safe_for_read(GPCMD_GET_EVENT_STATUS_NOTIFICATION),
-		safe_for_read(GPCMD_GET_PERFORMANCE),
-		safe_for_read(GPCMD_SEEK),
-		safe_for_read(GPCMD_STOP_PLAY_SCAN),
-
-		/* Basic writing commands */
-		safe_for_write(WRITE_6),
-		safe_for_write(WRITE_10),
-		safe_for_write(WRITE_VERIFY),
-		safe_for_write(WRITE_12),
-		safe_for_write(WRITE_VERIFY_12),
-		safe_for_write(WRITE_16),
-		safe_for_write(WRITE_LONG),
-		safe_for_write(WRITE_LONG_2),
-		safe_for_write(ERASE),
-		safe_for_write(GPCMD_MODE_SELECT_10),
-		safe_for_write(MODE_SELECT),
-		safe_for_write(LOG_SELECT),
-		safe_for_write(GPCMD_BLANK),
-		safe_for_write(GPCMD_CLOSE_TRACK),
-		safe_for_write(GPCMD_FLUSH_CACHE),
-		safe_for_write(GPCMD_FORMAT_UNIT),
-		safe_for_write(GPCMD_REPAIR_RZONE_TRACK),
-		safe_for_write(GPCMD_RESERVE_RZONE_TRACK),
-		safe_for_write(GPCMD_SEND_DVD_STRUCTURE),
-		safe_for_write(GPCMD_SEND_EVENT),
-		safe_for_write(GPCMD_SEND_KEY),
-		safe_for_write(GPCMD_SEND_OPC),
-		safe_for_write(GPCMD_SEND_CUE_SHEET),
-		safe_for_write(GPCMD_SET_SPEED),
-		safe_for_write(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL),
-		safe_for_write(GPCMD_LOAD_UNLOAD),
-		safe_for_write(GPCMD_SET_STREAMING),
-	};
-	unsigned char type = cmd_type[cmd[0]];
-
-	/* Anybody who can open the device can do a read-safe command */
-	if (type & CMD_READ_SAFE)
-		return 0;
-
-	/* Write-safe commands just require a writable open.. */
-	if ((type & CMD_WRITE_SAFE) && has_write_perm)
-		return 0;
-
-	/* And root can do any command.. */
-	if (capable(CAP_SYS_RAWIO))
-		return 0;
-
-	if (!type) {
-		cmd_type[cmd[0]] = CMD_WARNED;
-		printk(KERN_WARNING "scsi: unknown opcode 0x%02x\n", cmd[0]);
-	}
-
-	/* Otherwise fail it with an "Operation not permitted" */
-	return -EPERM;
-}
-EXPORT_SYMBOL_GPL(blk_verify_command);
-
 static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq,
-			     struct sg_io_hdr *hdr, int has_write_perm)
+			     struct sg_io_hdr *hdr, struct file *file)
 {
 	if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len))
 		return -EFAULT;
-	if (blk_verify_command(rq->cmd, has_write_perm))
+	if (blk_verify_command(file, rq->cmd))
 		return -EPERM;
 
 	/*
@@ -287,7 +179,7 @@ static int sg_io(struct file *file, struct request_queue *q,
 		struct gendisk *bd_disk, struct sg_io_hdr *hdr)
 {
 	unsigned long start_time;
-	int writing = 0, ret = 0, has_write_perm = 0;
+	int writing = 0, ret = 0;
 	struct request *rq;
 	char sense[SCSI_SENSE_BUFFERSIZE];
 	struct bio *bio;
@@ -316,10 +208,7 @@ static int sg_io(struct file *file, struct request_queue *q,
 	if (!rq)
 		return -ENOMEM;
 
-	if (file)
-		has_write_perm = file->f_mode & FMODE_WRITE;
-
-	if (blk_fill_sghdr_rq(q, rq, hdr, has_write_perm)) {
+	if (blk_fill_sghdr_rq(q, rq, hdr, file)) {
 		blk_put_request(rq);
 		return -EFAULT;
 	}
@@ -451,7 +340,7 @@ int sg_scsi_ioctl(struct file *file, struct request_queue *q,
 	if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len))
 		goto error;
 
-	err = blk_verify_command(rq->cmd, file->f_mode & FMODE_WRITE);
+	err = blk_verify_command(file, rq->cmd);
 	if (err)
 		goto error;
 
-- 
cgit v1.2.3


From c265a7f41706cee20508de5b4a919214cfd7a11b Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 26 Jun 2008 13:49:33 +0200
Subject: cfq-iosched: get rid of enable_idle being unused warning

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 0ebb626a25d..1e2aff812ee 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1731,7 +1731,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
 		return;
 
-	old_idle = cfq_cfqq_idle_window(cfqq);
+	enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
 
 	if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
 	    (cfqd->hw_tag && CIC_SEEKY(cic)))
-- 
cgit v1.2.3


From 07359fc61bb8ed786f96a1c24cca6f94dd17e329 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Thu, 26 Jun 2008 19:39:23 +0200
Subject: block: add bounce support to blk_rq_map_user_iov

blk_rq_map_user_iov can't handle the bounce buffer (it means that the
bio_map_user_iov path doesn't work with a LLD that needs GFP_DMA).

This patch fixes blk_rq_map_user_iov to support the bounce buffer.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-map.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'block')

diff --git a/block/blk-map.c b/block/blk-map.c
index 0b1af5a3537..813011ef827 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -210,6 +210,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 	if (!bio_flagged(bio, BIO_USER_MAPPED))
 		rq->cmd_flags |= REQ_COPY_USER;
 
+	blk_queue_bounce(q, &bio);
 	bio_get(bio);
 	blk_rq_bio_prep(q, rq, bio);
 	rq->buffer = rq->data = NULL;
-- 
cgit v1.2.3


From 06a452e5b95eb669b7ad414ccf587dfc2d91b217 Mon Sep 17 00:00:00 2001
From: Adel Gadllah <adel.gadllah@gmail.com>
Date: Fri, 27 Jun 2008 09:16:17 +0200
Subject: cmdfilter: extend default read filter

This patch adds the commands that the former sg filter allowed for read
access to the cmdfilter to keep userspace apps that rely on them working.

Signed-off-by: Adel Gadllah <adel.gadllah@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cmd-filter.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'block')

diff --git a/block/cmd-filter.c b/block/cmd-filter.c
index 35e327ceaa9..eec4404fd35 100644
--- a/block/cmd-filter.c
+++ b/block/cmd-filter.c
@@ -219,6 +219,10 @@ static struct kobj_type rcf_ktype = {
 	.default_attrs = default_attrs,
 };
 
+#ifndef MAINTENANCE_IN_CMD
+#define MAINTENANCE_IN_CMD 0xa3
+#endif
+
 static void rcf_set_defaults(struct blk_scsi_cmd_filter *filter)
 {
 	/* Basic read-only commands */
@@ -230,6 +234,7 @@ static void rcf_set_defaults(struct blk_scsi_cmd_filter *filter)
 	__set_bit(READ_16, filter->read_ok);
 	__set_bit(READ_BUFFER, filter->read_ok);
 	__set_bit(READ_DEFECT_DATA, filter->read_ok);
+	__set_bit(READ_CAPACITY, filter->read_ok);
 	__set_bit(READ_LONG, filter->read_ok);
 	__set_bit(INQUIRY, filter->read_ok);
 	__set_bit(MODE_SENSE, filter->read_ok);
@@ -238,6 +243,10 @@ static void rcf_set_defaults(struct blk_scsi_cmd_filter *filter)
 	__set_bit(START_STOP, filter->read_ok);
 	__set_bit(GPCMD_VERIFY_10, filter->read_ok);
 	__set_bit(VERIFY_16, filter->read_ok);
+	__set_bit(REPORT_LUNS, filter->read_ok);
+	__set_bit(SERVICE_ACTION_IN, filter->read_ok);
+	__set_bit(RECEIVE_DIAGNOSTIC, filter->read_ok);
+	__set_bit(MAINTENANCE_IN_CMD, filter->read_ok);
 	__set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok);
 
 	/* Audio CD commands */
-- 
cgit v1.2.3


From b24498d477a14680fc3bb3ad884fa9fa76a2d237 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 27 Jun 2008 09:12:09 +0200
Subject: block: integrity flags can't use bit ops on unsigned short

Just use normal open coded bit operations instead, they need not be
atomic.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-integrity.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'block')

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 4ffa3814f6a..3f1a8478cc3 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -217,17 +217,16 @@ static ssize_t integrity_read_store(struct blk_integrity *bi,
 	unsigned long val = simple_strtoul(p, &p, 10);
 
 	if (val)
-		set_bit(INTEGRITY_FLAG_READ, &bi->flags);
+		bi->flags |= INTEGRITY_FLAG_READ;
 	else
-		clear_bit(INTEGRITY_FLAG_READ, &bi->flags);
+		bi->flags &= ~INTEGRITY_FLAG_READ;
 
 	return count;
 }
 
 static ssize_t integrity_read_show(struct blk_integrity *bi, char *page)
 {
-	return sprintf(page, "%d\n",
-		       test_bit(INTEGRITY_FLAG_READ, &bi->flags) ? 1 : 0);
+	return sprintf(page, "%d\n", (bi->flags & INTEGRITY_FLAG_READ) != 0);
 }
 
 static ssize_t integrity_write_store(struct blk_integrity *bi,
@@ -237,17 +236,16 @@ static ssize_t integrity_write_store(struct blk_integrity *bi,
 	unsigned long val = simple_strtoul(p, &p, 10);
 
 	if (val)
-		set_bit(INTEGRITY_FLAG_WRITE, &bi->flags);
+		bi->flags |= INTEGRITY_FLAG_WRITE;
 	else
-		clear_bit(INTEGRITY_FLAG_WRITE, &bi->flags);
+		bi->flags &= ~INTEGRITY_FLAG_WRITE;
 
 	return count;
 }
 
 static ssize_t integrity_write_show(struct blk_integrity *bi, char *page)
 {
-	return sprintf(page, "%d\n",
-		       test_bit(INTEGRITY_FLAG_WRITE, &bi->flags) ? 1 : 0);
+	return sprintf(page, "%d\n", (bi->flags & INTEGRITY_FLAG_WRITE) != 0);
 }
 
 static struct integrity_sysfs_entry integrity_format_entry = {
@@ -340,8 +338,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
 
 		kobject_uevent(&bi->kobj, KOBJ_ADD);
 
-		set_bit(INTEGRITY_FLAG_READ, &bi->flags);
-		set_bit(INTEGRITY_FLAG_WRITE, &bi->flags);
+		bi->flags |= INTEGRITY_FLAG_READ | INTEGRITY_FLAG_WRITE;
 		bi->sector_size = disk->queue->hardsect_size;
 		disk->integrity = bi;
 	} else
-- 
cgit v1.2.3


From e180f5949327e897bc35a816f4f4010186632df9 Mon Sep 17 00:00:00 2001
From: maximilian attems <max@stro.at>
Date: Tue, 1 Jul 2008 09:42:47 +0200
Subject: block: request_module(): use format string

Avoid bad things happening if the module has a printk control string in
its name.

Signed-off-by: maximilian attems <max@stro.at>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/elevator.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/elevator.c b/block/elevator.c
index 1f5bfe69602..ed6f8f32d27 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -150,7 +150,7 @@ static struct elevator_type *elevator_get(const char *name)
 		else
 			sprintf(elv, "%s-iosched", name);
 
-		request_module(elv);
+		request_module("%s", elv);
 		spin_lock(&elv_list_lock);
 		e = elevator_find(name);
 	}
-- 
cgit v1.2.3


From e48ec69005f02b70b7ecfde1bc39a599086d16ef Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 3 Jul 2008 13:18:54 +0200
Subject: block: extend queue_flag bitops

Add test_and_clear and test_and_set.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index e0fb0bcc0c1..dbc7f42b5d2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -205,8 +205,7 @@ void blk_plug_device(struct request_queue *q)
 	if (blk_queue_stopped(q))
 		return;
 
-	if (!test_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
-		__set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags);
+	if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) {
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
 		blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
 	}
@@ -221,10 +220,9 @@ int blk_remove_plug(struct request_queue *q)
 {
 	WARN_ON(!irqs_disabled());
 
-	if (!test_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
+	if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q))
 		return 0;
 
-	queue_flag_clear(QUEUE_FLAG_PLUGGED, q);
 	del_timer(&q->unplug_timer);
 	return 1;
 }
@@ -328,8 +326,7 @@ void blk_start_queue(struct request_queue *q)
 	 * one level of recursion is ok and is much faster than kicking
 	 * the unplug handling
 	 */
-	if (!test_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
-		queue_flag_set(QUEUE_FLAG_REENTER, q);
+	if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
 		q->request_fn(q);
 		queue_flag_clear(QUEUE_FLAG_REENTER, q);
 	} else {
@@ -394,8 +391,7 @@ void __blk_run_queue(struct request_queue *q)
 	 * handling reinvoke the handler shortly if we already got there.
 	 */
 	if (!elv_queue_empty(q)) {
-		if (!test_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
-			queue_flag_set(QUEUE_FLAG_REENTER, q);
+		if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
 			q->request_fn(q);
 			queue_flag_clear(QUEUE_FLAG_REENTER, q);
 		} else {
-- 
cgit v1.2.3


From 27f8221af406e43b529a5425bc99c9b1e9bdf521 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Fri, 4 Jul 2008 09:30:03 +0200
Subject: block: add blk_queue_update_dma_pad

This adds blk_queue_update_dma_pad to prevent LLDs from overwriting
the dma pad mask wrongly (we added blk_queue_update_dma_alignment due
to the same reason).

This also converts libata to use blk_queue_update_dma_pad instead of
blk_queue_dma_pad.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Tejun Heo <htejun@gmail.com>
Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-settings.c | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 8dd86418f35..dfc77012843 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -302,11 +302,10 @@ EXPORT_SYMBOL(blk_queue_stack_limits);
  * @q:     the request queue for the device
  * @mask:  pad mask
  *
- * Set pad mask.  Direct IO requests are padded to the mask specified.
+ * Set dma pad mask.
  *
- * Appending pad buffer to a request modifies ->data_len such that it
- * includes the pad buffer.  The original requested data length can be
- * obtained using blk_rq_raw_data_len().
+ * Appending pad buffer to a request modifies the last entry of a
+ * scatter list such that it includes the pad buffer.
  **/
 void blk_queue_dma_pad(struct request_queue *q, unsigned int mask)
 {
@@ -314,6 +313,23 @@ void blk_queue_dma_pad(struct request_queue *q, unsigned int mask)
 }
 EXPORT_SYMBOL(blk_queue_dma_pad);
 
+/**
+ * blk_queue_update_dma_pad - update pad mask
+ * @q:     the request queue for the device
+ * @mask:  pad mask
+ *
+ * Update dma pad mask.
+ *
+ * Appending pad buffer to a request modifies the last entry of a
+ * scatter list such that it includes the pad buffer.
+ **/
+void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask)
+{
+	if (mask > q->dma_pad_mask)
+		q->dma_pad_mask = mask;
+}
+EXPORT_SYMBOL(blk_queue_update_dma_pad);
+
 /**
  * blk_queue_dma_drain - Set up a drain buffer for excess dma.
  * @q:  the request queue for the device
-- 
cgit v1.2.3


From 30c00eda73d5db5bd64dd0c370161abd8df5ba4a Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Fri, 4 Jul 2008 09:31:11 +0200
Subject: block: blk_rq_map_kern uses the bounce buffers for stack buffers

blk_rq_map_kern is used for kernel internal I/Os. Some callers use
this function with stack buffers but DMA to/from the stack buffers
leads to memory corruption on a non-coherent platform.

This patch makes blk_rq_map_kern use the bounce buffers if a caller
passes a stack buffer (on all platforms, for simplicity).

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Tejun Heo <htejun@gmail.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-map.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'block')

diff --git a/block/blk-map.c b/block/blk-map.c
index 813011ef827..ddd96fb11a7 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -269,6 +269,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	int reading = rq_data_dir(rq) == READ;
 	int do_copy = 0;
 	struct bio *bio;
+	unsigned long stack_mask = ~(THREAD_SIZE - 1);
 
 	if (len > (q->max_hw_sectors << 9))
 		return -EINVAL;
@@ -279,6 +280,10 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	alignment = queue_dma_alignment(q) | q->dma_pad_mask;
 	do_copy = ((kaddr & alignment) || (len & alignment));
 
+	if (!((kaddr & stack_mask) ^
+	      ((unsigned long)current->stack & stack_mask)))
+		do_copy = 1;
+
 	if (do_copy)
 		bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
 	else
-- 
cgit v1.2.3


From 3f27e3ed11e67c5ee19d560a50eafd93cf8c6682 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Thu, 29 May 2008 07:56:55 +0900
Subject: [SCSI] bsg: fix bsg_mutex hang with device removal

We don't need to hold bsg_mutex during bsg_complete_all_commands().
Holding it causes a problem: we block bsg_unregister_queue during
bsg_complete_all_commands (until all the outstanding commands
complete).

Thanks to Pete Wyckoff for finding the bug and testing the patch.

The detailed bug report is:

http://marc.info/?l=linux-scsi&m=121182137132145&w=2

Tested-by: Pete Wyckoff <pw@osc.edu>
Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 block/bsg.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/bsg.c b/block/bsg.c
index f0b7cd34321..7cdec32205d 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -724,8 +724,13 @@ static int bsg_put_device(struct bsg_device *bd)
 	mutex_lock(&bsg_mutex);
 
 	do_free = atomic_dec_and_test(&bd->ref_count);
-	if (!do_free)
+	if (!do_free) {
+		mutex_unlock(&bsg_mutex);
 		goto out;
+	}
+
+	hlist_del(&bd->dev_list);
+	mutex_unlock(&bsg_mutex);
 
 	dprintk("%s: tearing down\n", bd->name);
 
@@ -741,10 +746,8 @@ static int bsg_put_device(struct bsg_device *bd)
 	 */
 	ret = bsg_complete_all_commands(bd);
 
-	hlist_del(&bd->dev_list);
 	kfree(bd);
 out:
-	mutex_unlock(&bsg_mutex);
 	kref_put(&q->bsg_dev.ref, bsg_kref_release_function);
 	if (do_free)
 		blk_put_queue(q);
-- 
cgit v1.2.3


From 8df5fc042c8e7c08dc438c8198b62407ee1e91a0 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Mon, 7 Jul 2008 15:50:01 -0500
Subject: [SCSI] bsg: fix oops on remove

If you do a modremove of any sas driver, you run into an oops on
shutdown when the host is removed (coming from the host bsg device).
The root cause seems to be that there's a use after free of the
bsg_class_device:  In bsg_kref_release_function, this is used (to do a
put_device(bcd->parent)) after bcd->release has been called.  In sas (and
possibly many other things) bcd->release frees the queue which contains
the bsg_class_device, so we get a put_device on unreferenced memory.
Fix this by taking a copy of the pointer to the parent before releasing
bsg.

Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 block/bsg.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/bsg.c b/block/bsg.c
index f0b7cd34321..54d617f7df3 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -709,11 +709,12 @@ static void bsg_kref_release_function(struct kref *kref)
 {
 	struct bsg_class_device *bcd =
 		container_of(kref, struct bsg_class_device, ref);
+	struct device *parent = bcd->parent;
 
 	if (bcd->release)
 		bcd->release(bcd->parent);
 
-	put_device(bcd->parent);
+	put_device(parent);
 }
 
 static int bsg_put_device(struct bsg_device *bd)
-- 
cgit v1.2.3


From 9a2d43b7566caeeeb414aa628bc2759028897dbb Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 15 Jul 2008 21:21:43 +0200
Subject: block: handle blk_pm_resume_request() requests in
 blk_execute_rq_nowait()

For blk_pm_resume_request() requests (which are used only by IDE subsystem
currently) the queue is stopped so we need to call ->request_fn explicitly.

Thanks to:
- Rafael for reporting/bisecting the bug
- Borislav/Rafael for testing the fix

This is a preparation for converting IDE to use blk_execute_rq().

Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Borislav Petkov <petkovbb@gmail.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 block/blk-exec.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'block')

diff --git a/block/blk-exec.c b/block/blk-exec.c
index 391dd622489..4f52f279205 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -58,6 +58,9 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 	spin_lock_irq(q->queue_lock);
 	__elv_add_request(q, rq, where, 1);
 	__generic_unplug_device(q);
+	/* the queue is stopped so it won't be plugged+unplugged */
+	if (blk_pm_resume_request(rq))
+		q->request_fn(q);
 	spin_unlock_irq(q->queue_lock);
 }
 EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
-- 
cgit v1.2.3


From 52a93ba815737e3877f85b46850cffe993a22429 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 15 Jul 2008 21:21:45 +0200
Subject: block: remove the checking for NULL queue in blk_put_request

Some callers use blk_put_request asymmetrically, that is, they use it
with requests that were not allocated by blk_get_request. As a result,
blk_put_request has a hack to catch a NULL request_queue. Now such
callers are fixed (they use blk_get_request properly), so we can
safely remove the hack in blk_put_request.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 block/blk-core.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 1905aaba49f..ac83cf9a19a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1042,15 +1042,9 @@ void blk_put_request(struct request *req)
 	unsigned long flags;
 	struct request_queue *q = req->q;
 
-	/*
-	 * Gee, IDE calls in w/ NULL q.  Fix IDE and remove the
-	 * following if (q) test.
-	 */
-	if (q) {
-		spin_lock_irqsave(q->queue_lock, flags);
-		__blk_put_request(q, req);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-	}
+	spin_lock_irqsave(q->queue_lock, flags);
+	__blk_put_request(q, req);
+	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 EXPORT_SYMBOL(blk_put_request);
 
-- 
cgit v1.2.3


From 681a561b7ec7fdcd8f35b68e44ac6d6c70aecc04 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 15 Jul 2008 21:21:45 +0200
Subject: block: unexport blk_end_sync_rq

All the users of blk_end_sync_rq are gone (they have been converted to
use blk_execute_rq). This unexports blk_end_sync_rq.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 block/blk-exec.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-exec.c b/block/blk-exec.c
index 4f52f279205..9bceff7674f 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -18,7 +18,7 @@
  * @rq: request to complete
  * @error: end io status of the request
  */
-void blk_end_sync_rq(struct request *rq, int error)
+static void blk_end_sync_rq(struct request *rq, int error)
 {
 	struct completion *waiting = rq->end_io_data;
 
@@ -31,7 +31,6 @@ void blk_end_sync_rq(struct request *rq, int error)
 	 */
 	complete(waiting);
 }
-EXPORT_SYMBOL(blk_end_sync_rq);
 
 /**
  * blk_execute_rq_nowait - insert a request into queue for execution
-- 
cgit v1.2.3


From e105b8bfc769b0545b6f0f395179d1e43cbee822 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 21 Apr 2008 10:51:07 -0700
Subject: sysfs: add /sys/dev/{char,block} to lookup sysfs path by major:minor

Why?:
There are occasions where userspace would like to access sysfs
attributes for a device but it may not know how sysfs has named the
device or the path.  For example what is the sysfs path for
/dev/disk/by-id/ata-ST3160827AS_5MT004CK?  With this change a call to
stat(2) returns the major:minor then userspace can see that
/sys/dev/block/8:32 links to /sys/block/sdc.

What are the alternatives?:
1/ Add an ioctl to return the path: Doable, but sysfs is meant to reduce
   the need to proliferate ioctl interfaces into the kernel, so this
   seems counter productive.

2/ Use udev to create these symlinks: Also doable, but it adds a
   udev dependency to utilities that might be running in a limited
   environment like an initramfs.

3/ Do a full-tree search of sysfs.

[kay.sievers@vrfy.org: fix duplicate registrations]
[kay.sievers@vrfy.org: cleanup suggestions]

Cc: Neil Brown <neilb@suse.de>
Cc: Tejun Heo <htejun@gmail.com>
Acked-by: Kay Sievers <kay.sievers@vrfy.org>
Reviewed-by: SL Baur <steve@xemacs.org>
Acked-by: Kay Sievers <kay.sievers@vrfy.org>
Acked-by: Mark Lord <lkml@rtr.ca>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 block/genhd.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 9074f384b09..24e3fc9095f 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -370,7 +370,10 @@ static struct kobject *base_probe(dev_t devt, int *part, void *data)
 
 static int __init genhd_device_init(void)
 {
-	int error = class_register(&block_class);
+	int error;
+
+	block_class.dev_kobj = sysfs_dev_block_kobj;
+	error = class_register(&block_class);
 	if (unlikely(error))
 		return error;
 	bdev_map = kobj_map_init(base_probe, &block_class_lock);
-- 
cgit v1.2.3


From f79f060561d04a38d41e773ade9baafce3c96179 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Wed, 21 May 2008 12:52:33 -0700
Subject: device create: block: convert device_create to device_create_drvdata

device_create() is race-prone, so use the race-free
device_create_drvdata() instead as device_create() is going away.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 block/bsg.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/bsg.c b/block/bsg.c
index 5fb9b0bdbe8..5a68b09a69b 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -1044,7 +1044,8 @@ int bsg_register_queue(struct request_queue *q, struct device *parent,
 	bcd->release = release;
 	kref_init(&bcd->ref);
 	dev = MKDEV(bsg_major, bcd->minor);
-	class_dev = device_create(bsg_class, parent, dev, "%s", devname);
+	class_dev = device_create_drvdata(bsg_class, parent, dev, NULL,
+					  "%s", devname);
 	if (IS_ERR(class_dev)) {
 		ret = PTR_ERR(class_dev);
 		goto put_dev;
-- 
cgit v1.2.3


From 6ffeea77ff014f1456fcd0564eac84b34e9535ca Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Thu, 22 May 2008 17:21:08 -0400
Subject: block: fix compiler warning in genhd.c

Warn if something really bad happens if we can't create this link.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 block/genhd.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 24e3fc9095f..3ccf5c01756 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -183,6 +183,7 @@ static int exact_lock(dev_t devt, void *data)
 void add_disk(struct gendisk *disk)
 {
 	struct backing_dev_info *bdi;
+	int retval;
 
 	disk->flags |= GENHD_FL_UP;
 	blk_register_region(MKDEV(disk->major, disk->first_minor),
@@ -193,7 +194,8 @@ void add_disk(struct gendisk *disk)
 
 	bdi = &disk->queue->backing_dev_info;
 	bdi_register_dev(bdi, MKDEV(disk->major, disk->first_minor));
-	sysfs_create_link(&disk->dev.kobj, &bdi->dev->kobj, "bdi");
+	retval = sysfs_create_link(&disk->dev.kobj, &bdi->dev->kobj, "bdi");
+	WARN_ON(retval);
 }
 
 EXPORT_SYMBOL(add_disk);
-- 
cgit v1.2.3


From 5c6f35c5ece8f130cd8ec9ba0d71dc146b46a0f1 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Thu, 22 May 2008 17:21:08 -0400
Subject: block: make printk_partition use the class iterator function

Use the proper class iterator function instead of mucking around in the
internals of the class structures.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 block/genhd.c | 93 ++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 50 insertions(+), 43 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 3ccf5c01756..10b9ac46c2d 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -227,58 +227,65 @@ struct gendisk *get_gendisk(dev_t devt, int *part)
 }
 
 /*
- * print a full list of all partitions - intended for places where the root
- * filesystem can't be mounted and thus to give the victim some idea of what
- * went wrong
+ * print a partitions - intended for places where the root filesystem can't be
+ * mounted and thus to give the victim some idea of what went wrong
  */
-void __init printk_all_partitions(void)
+static int printk_partition(struct device *dev, void *data)
 {
-	struct device *dev;
 	struct gendisk *sgp;
 	char buf[BDEVNAME_SIZE];
 	int n;
 
-	mutex_lock(&block_class_lock);
-	/* For each block device... */
-	list_for_each_entry(dev, &block_class.devices, node) {
-		if (dev->type != &disk_type)
-			continue;
-		sgp = dev_to_disk(dev);
-		/*
-		 * Don't show empty devices or things that have been surpressed
-		 */
-		if (get_capacity(sgp) == 0 ||
-		    (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
-			continue;
+	if (dev->type != &disk_type)
+		goto exit;
 
-		/*
-		 * Note, unlike /proc/partitions, I am showing the numbers in
-		 * hex - the same format as the root= option takes.
-		 */
-		printk("%02x%02x %10llu %s",
-			sgp->major, sgp->first_minor,
-			(unsigned long long)get_capacity(sgp) >> 1,
-			disk_name(sgp, 0, buf));
-		if (sgp->driverfs_dev != NULL &&
-		    sgp->driverfs_dev->driver != NULL)
-			printk(" driver: %s\n",
-				sgp->driverfs_dev->driver->name);
-		else
-			printk(" (driver?)\n");
-
-		/* now show the partitions */
-		for (n = 0; n < sgp->minors - 1; ++n) {
-			if (sgp->part[n] == NULL)
-				continue;
-			if (sgp->part[n]->nr_sects == 0)
-				continue;
-			printk("  %02x%02x %10llu %s\n",
-				sgp->major, n + 1 + sgp->first_minor,
-				(unsigned long long)sgp->part[n]->nr_sects >> 1,
-				disk_name(sgp, n + 1, buf));
-		}
+	sgp = dev_to_disk(dev);
+	/*
+	 * Don't show empty devices or things that have been surpressed
+	 */
+	if (get_capacity(sgp) == 0 ||
+	    (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
+		goto exit;
+
+	/*
+	 * Note, unlike /proc/partitions, I am showing the numbers in
+	 * hex - the same format as the root= option takes.
+	 */
+	printk("%02x%02x %10llu %s",
+		sgp->major, sgp->first_minor,
+		(unsigned long long)get_capacity(sgp) >> 1,
+		disk_name(sgp, 0, buf));
+	if (sgp->driverfs_dev != NULL &&
+	    sgp->driverfs_dev->driver != NULL)
+		printk(" driver: %s\n",
+			sgp->driverfs_dev->driver->name);
+	else
+		printk(" (driver?)\n");
+
+	/* now show the partitions */
+	for (n = 0; n < sgp->minors - 1; ++n) {
+		if (sgp->part[n] == NULL)
+			goto exit;
+		if (sgp->part[n]->nr_sects == 0)
+			goto exit;
+		printk("  %02x%02x %10llu %s\n",
+			sgp->major, n + 1 + sgp->first_minor,
+			(unsigned long long)sgp->part[n]->nr_sects >> 1,
+			disk_name(sgp, n + 1, buf));
 	}
+exit:
+	return 0;
+}
 
+/*
+ * print a full list of all partitions - intended for places where the root
+ * filesystem can't be mounted and thus to give the victim some idea of what
+ * went wrong
+ */
+void __init printk_all_partitions(void)
+{
+	mutex_lock(&block_class_lock);
+	class_for_each_device(&block_class, NULL, NULL, printk_partition);
 	mutex_unlock(&block_class_lock);
 }
 
-- 
cgit v1.2.3


From a142be856f060ae8106512c0e81a8d6f8746ab0b Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Thu, 22 May 2008 17:21:08 -0400
Subject: block: make blk_lookup_devt use the class iterator function

Use the proper class iterator function instead of mucking around in the
internals of the class structures.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 block/genhd.c | 38 ++++++++++++++++++++++++++------------
 1 file changed, 26 insertions(+), 12 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 10b9ac46c2d..e8c42bfd12b 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -677,24 +677,38 @@ void genhd_media_change_notify(struct gendisk *disk)
 EXPORT_SYMBOL_GPL(genhd_media_change_notify);
 #endif  /*  0  */
 
+struct find_block {
+	const char *name;
+	int part;
+};
+
+static int match_id(struct device *dev, void *data)
+{
+	struct find_block *find = data;
+
+	if (dev->type != &disk_type)
+		return 0;
+	if (strcmp(dev->bus_id, find->name) == 0) {
+		struct gendisk *disk = dev_to_disk(dev);
+		if (find->part < disk->minors)
+			return 1;
+	}
+	return 0;
+}
+
 dev_t blk_lookup_devt(const char *name, int part)
 {
 	struct device *dev;
 	dev_t devt = MKDEV(0, 0);
+	struct find_block find;
 
 	mutex_lock(&block_class_lock);
-	list_for_each_entry(dev, &block_class.devices, node) {
-		if (dev->type != &disk_type)
-			continue;
-		if (strcmp(dev->bus_id, name) == 0) {
-			struct gendisk *disk = dev_to_disk(dev);
-
-			if (part < disk->minors)
-				devt = MKDEV(MAJOR(dev->devt),
-					     MINOR(dev->devt) + part);
-			break;
-		}
-	}
+	find.name = name;
+	find.part = part;
+	dev = class_find_device(&block_class, NULL, (void *)&find, match_id);
+	if (dev)
+		devt = MKDEV(MAJOR(dev->devt),
+			     MINOR(dev->devt) + part);
 	mutex_unlock(&block_class_lock);
 
 	return devt;
-- 
cgit v1.2.3


From a6e2ba88774bc5870ab3d9664cb86d70415f7402 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Fri, 23 May 2008 09:44:11 -0700
Subject: block: make /proc/diskstats only build if CONFIG_PROC_FS is enabled

These functions are only needed if CONFIG_PROC_FS is enabled, so save
the space when it is not.

This also makes it easier for a patch later in this series to work
properly if CONFIG_PROC_FS is not enabled.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 block/genhd.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index e8c42bfd12b..68a5f28007e 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -544,6 +544,7 @@ static struct device_type disk_type = {
 	.release	= disk_release,
 };
 
+#ifdef CONFIG_PROC_FS
 /*
  * aggregate disk stat collector.  Uses the same stats that the sysfs
  * entries do, above, but makes them available through one seq_file.
@@ -653,6 +654,7 @@ const struct seq_operations diskstats_op = {
 	.stop	= diskstats_stop,
 	.show	= diskstats_show
 };
+#endif /* CONFIG_PROC_FS */
 
 static void media_change_notify_thread(struct work_struct *work)
 {
-- 
cgit v1.2.3


From 68c4d4a7875c59f2e4b72901ab11ba978e75bde0 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Thu, 22 May 2008 17:21:08 -0400
Subject: block: make proc files seq_start use the class_find_device()

Use the proper class iterator function instead of mucking around in the
internals of the class structures.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 block/genhd.c | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 68a5f28007e..f03bdadc52a 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -291,18 +291,25 @@ void __init printk_all_partitions(void)
 
 #ifdef CONFIG_PROC_FS
 /* iterator */
+static int find_start(struct device *dev, void *data)
+{
+	loff_t k = *(loff_t *)data;
+
+	if (dev->type != &disk_type)
+		return 0;
+	if (!k--)
+		return 1;
+	return 0;
+}
+
 static void *part_start(struct seq_file *part, loff_t *pos)
 {
-	loff_t k = *pos;
 	struct device *dev;
 
 	mutex_lock(&block_class_lock);
-	list_for_each_entry(dev, &block_class.devices, node) {
-		if (dev->type != &disk_type)
-			continue;
-		if (!k--)
-			return dev_to_disk(dev);
-	}
+	dev = class_find_device(&block_class, NULL, (void *)pos, find_start);
+	if (dev)
+		return dev_to_disk(dev);
 	return NULL;
 }
 
@@ -555,16 +562,12 @@ static struct device_type disk_type = {
 
 static void *diskstats_start(struct seq_file *part, loff_t *pos)
 {
-	loff_t k = *pos;
 	struct device *dev;
 
 	mutex_lock(&block_class_lock);
-	list_for_each_entry(dev, &block_class.devices, node) {
-		if (dev->type != &disk_type)
-			continue;
-		if (!k--)
-			return dev_to_disk(dev);
-	}
+	dev = class_find_device(&block_class, NULL, (void *)pos, find_start);
+	if (dev)
+		return dev_to_disk(dev);
 	return NULL;
 }
 
-- 
cgit v1.2.3


From 66c64afec16a7b46212ecb2fa99998923bbeea3f Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Thu, 22 May 2008 17:21:08 -0400
Subject: block: move header for /proc/partitions to seq_start

The seq_start call is the better place for the header for the file, that
way we don't have to be mucking in the class structure to try to figure
out if this is the first partition or not.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 block/genhd.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index f03bdadc52a..70f1d707578 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -305,6 +305,10 @@ static int find_start(struct device *dev, void *data)
 static void *part_start(struct seq_file *part, loff_t *pos)
 {
 	struct device *dev;
+	loff_t n = *pos;
+
+	if (!n)
+		seq_puts(part, "major minor  #blocks  name\n\n");
 
 	mutex_lock(&block_class_lock);
 	dev = class_find_device(&block_class, NULL, (void *)pos, find_start);
@@ -338,9 +342,6 @@ static int show_partition(struct seq_file *part, void *v)
 	int n;
 	char buf[BDEVNAME_SIZE];
 
-	if (&sgp->dev.node == block_class.devices.next)
-		seq_puts(part, "major minor  #blocks  name\n\n");
-
 	/* Don't show non-partitionable removeable devices or empty devices */
 	if (!get_capacity(sgp) ||
 			(sgp->minors == 1 && (sgp->flags & GENHD_FL_REMOVABLE)))
-- 
cgit v1.2.3


From 27f302519148f311307637d4c9a6d0fd87d07e4c Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Thu, 22 May 2008 17:21:08 -0400
Subject: block: make /proc/partitions and /proc/diskstats use
 class_find_device()

Use the proper class iterator function instead of mucking around in the
internals of the class structures.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 block/genhd.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 70f1d707578..c13cc77291a 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -317,17 +317,21 @@ static void *part_start(struct seq_file *part, loff_t *pos)
 	return NULL;
 }
 
+static int find_next(struct device *dev, void *data)
+{
+	if (dev->type == &disk_type)
+		return 1;
+	return 0;
+}
+
 static void *part_next(struct seq_file *part, void *v, loff_t *pos)
 {
 	struct gendisk *gp = v;
 	struct device *dev;
 	++*pos;
-	list_for_each_entry(dev, &gp->dev.node, node) {
-		if (&dev->node == &block_class.devices)
-			return NULL;
-		if (dev->type == &disk_type)
-			return dev_to_disk(dev);
-	}
+	dev = class_find_device(&block_class, &gp->dev, NULL, find_next);
+	if (dev)
+		return dev_to_disk(dev);
 	return NULL;
 }
 
@@ -578,12 +582,9 @@ static void *diskstats_next(struct seq_file *part, void *v, loff_t *pos)
 	struct device *dev;
 
 	++*pos;
-	list_for_each_entry(dev, &gp->dev.node, node) {
-		if (&dev->node == &block_class.devices)
-			return NULL;
-		if (dev->type == &disk_type)
-			return dev_to_disk(dev);
-	}
+	dev = class_find_device(&block_class, &gp->dev, NULL, find_next);
+	if (dev)
+		return dev_to_disk(dev);
 	return NULL;
 }
 
-- 
cgit v1.2.3


From 04ebd4aee52b06a2c38127d9208546e5b96f3a19 Mon Sep 17 00:00:00 2001
From: Abdel Benamrouche <draconux@gmail.com>
Date: Fri, 25 Jul 2008 01:48:26 -0700
Subject: block/ioctl.c and fs/partition/check.c: check value returned by
 add_partition()

Now that add_partition() has been taught to propagate errors, let's check them.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Abdel Benamrouche <draconux@gmail.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 block/ioctl.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/ioctl.c b/block/ioctl.c
index 52d6385216a..77185e5c026 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -17,6 +17,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 	long long start, length;
 	int part;
 	int i;
+	int err;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -61,9 +62,9 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 				}
 			}
 			/* all seems OK */
-			add_partition(disk, part, start, length, ADDPART_FLAG_NONE);
+			err = add_partition(disk, part, start, length, ADDPART_FLAG_NONE);
 			mutex_unlock(&bdev->bd_mutex);
-			return 0;
+			return err;
 		case BLKPG_DEL_PARTITION:
 			if (!disk->part[part-1])
 				return -ENXIO;
-- 
cgit v1.2.3


From a76eef9573c93f8f324ebacfd090a3e319a64d59 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Fri, 25 Jul 2008 19:44:39 -0700
Subject: block/blk-map.c: use the new object_is_on_stack() helper

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Tejun Heo <htejun@gmail.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 block/blk-map.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'block')

diff --git a/block/blk-map.c b/block/blk-map.c
index ddd96fb11a7..af37e4ae62f 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -269,7 +269,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	int reading = rq_data_dir(rq) == READ;
 	int do_copy = 0;
 	struct bio *bio;
-	unsigned long stack_mask = ~(THREAD_SIZE - 1);
 
 	if (len > (q->max_hw_sectors << 9))
 		return -EINVAL;
@@ -278,11 +277,8 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 
 	kaddr = (unsigned long)kbuf;
 	alignment = queue_dma_alignment(q) | q->dma_pad_mask;
-	do_copy = ((kaddr & alignment) || (len & alignment));
-
-	if (!((kaddr & stack_mask) ^
-	      ((unsigned long)current->stack & stack_mask)))
-		do_copy = 1;
+	do_copy = ((kaddr & alignment) || (len & alignment) ||
+		   object_is_on_stack(kbuf));
 
 	if (do_copy)
 		bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
-- 
cgit v1.2.3


From 12e0036818eed243c8ed6583ebf98261a2554e12 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 25 Jul 2008 19:45:38 -0700
Subject: Use WARN() in block/

Use WARN() instead of a printk+WARN_ON() pair; this way the message
becomes part of the warning section for better reporting/collection.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 block/as-iosched.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/as-iosched.c b/block/as-iosched.c
index 9735acb5b4f..cf4eb0eefbb 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -837,8 +837,7 @@ static void as_completed_request(struct request_queue *q, struct request *rq)
 	WARN_ON(!list_empty(&rq->queuelist));
 
 	if (RQ_STATE(rq) != AS_RQ_REMOVED) {
-		printk("rq->state %d\n", RQ_STATE(rq));
-		WARN_ON(1);
+		WARN(1, "rq->state %d\n", RQ_STATE(rq));
 		goto out;
 	}
 
-- 
cgit v1.2.3