From a3948663ed89c2f17e37cd0936d964341edb193e Mon Sep 17 00:00:00 2001
From: Tobias Klauser
Date: Mon, 20 Jun 2005 23:49:08 +0200
Subject: [PATCH] drivers/block/sx8.c: Use the DMA_{64,32}BIT_MASK constants

Use the DMA_{64,32}BIT_MASK constants from dma-mapping.h when calling
pci_set_dma_mask() or pci_set_consistent_dma_mask().

These patches include dma-mapping.h explicitly because it caused errors
on some architectures otherwise.
See http://marc.theaimsgroup.com/?t=108001993000001&r=1&w=2 for details

Signed-off-by: Tobias Klauser
Signed-off-by: Domen Puncer
---
 drivers/block/sx8.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)
(limited to 'drivers/block')

diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 5ed3a637945..9db0a9e3e59 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include <linux/dma-mapping.h>
 #include
 #include
 #include
@@ -1582,9 +1583,9 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto err_out;
 
 #if IF_64BIT_DMA_IS_POSSIBLE /* grrrr... */
-	rc = pci_set_dma_mask(pdev, 0xffffffffffffffffULL);
+	rc = pci_set_dma_mask(pdev, DMA_64BIT_MASK);
 	if (!rc) {
-		rc = pci_set_consistent_dma_mask(pdev, 0xffffffffffffffffULL);
+		rc = pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK);
 		if (rc) {
 			printk(KERN_ERR DRV_NAME "(%s): consistent DMA mask failure\n",
 				pci_name(pdev));
@@ -1593,7 +1594,7 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 		pci_dac = 1;
 	} else {
 #endif
-		rc = pci_set_dma_mask(pdev, 0xffffffffULL);
+		rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
 		if (rc) {
 			printk(KERN_ERR DRV_NAME "(%s): DMA mask failure\n",
 				pci_name(pdev));
--
cgit v1.2.3


From 22e2c507c301c3dbbcf91b4948b88f78842ee6c9 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 27 Jun 2005 10:55:12 +0200
Subject: [PATCH] Update cfq io scheduler to time sliced design

This updates the CFQ io scheduler to the new time sliced design (cfq v3).
It provides full process fairness, while giving excellent aggregate system
throughput even for many competing processes.  It supports io priorities,
either inherited from the cpu nice value or set directly with the
ioprio_get/set syscalls.  The latter closely mimic set/getpriority.

This import is based on my latest from -mm.
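For reference, a minimal userspace sketch of the ioprio_get/ioprio_set usage mentioned above (not part of this patch): the IOPRIO_* constants below mirror the ioprio.h encoding introduced by this series, and SYS_ioprio_set/SYS_ioprio_get are assumed to be exposed via <sys/syscall.h> since no libc wrappers existed at the time; on older systems the __NR_ numbers from asm/unistd.h would be used instead. It puts the calling process in the best-effort class at level 2 and reads the value back.

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* encoding assumed from include/linux/ioprio.h: class in the top 3 bits */
#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_PRIO_VALUE(class, data)	(((class) << IOPRIO_CLASS_SHIFT) | (data))

enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE };
enum { IOPRIO_WHO_PROCESS = 1, IOPRIO_WHO_PGRP, IOPRIO_WHO_USER };

int main(void)
{
	/* best-effort class, priority level 2 (0 is highest, 7 lowest) */
	int prio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 2);

	/* who == 0 means the calling process, as with set/getpriority */
	if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0, prio) == -1)
		perror("ioprio_set");

	printf("ioprio now 0x%lx\n",
	       syscall(SYS_ioprio_get, IOPRIO_WHO_PROCESS, 0));
	return 0;
}

As with set/getpriority, the "who" argument selects a process, process group, or user, which is what the commit message means by the syscalls closely mimicking that interface.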
Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- drivers/block/as-iosched.c | 5 +- drivers/block/cfq-iosched.c | 1910 +++++++++++++++++++++++++------------- drivers/block/deadline-iosched.c | 3 +- drivers/block/elevator.c | 9 +- drivers/block/ll_rw_blk.c | 59 +- 5 files changed, 1322 insertions(+), 664 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/as-iosched.c b/drivers/block/as-iosched.c index 3410b4d294b..91aeb678135 100644 --- a/drivers/block/as-iosched.c +++ b/drivers/block/as-iosched.c @@ -1806,7 +1806,8 @@ static void as_put_request(request_queue_t *q, struct request *rq) rq->elevator_private = NULL; } -static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask) +static int as_set_request(request_queue_t *q, struct request *rq, + struct bio *bio, int gfp_mask) { struct as_data *ad = q->elevator->elevator_data; struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask); @@ -1827,7 +1828,7 @@ static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask) return 1; } -static int as_may_queue(request_queue_t *q, int rw) +static int as_may_queue(request_queue_t *q, int rw, struct bio *bio) { int ret = ELV_MQUEUE_MAY; struct as_data *ad = q->elevator->elevator_data; diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c index 3ac47dde64d..35f6e569d5e 100644 --- a/drivers/block/cfq-iosched.c +++ b/drivers/block/cfq-iosched.c @@ -21,22 +21,33 @@ #include #include #include - -static unsigned long max_elapsed_crq; -static unsigned long max_elapsed_dispatch; +#include +#include /* * tunables */ static int cfq_quantum = 4; /* max queue in one round of service */ static int cfq_queued = 8; /* minimum rq allocate limit per-queue*/ -static int cfq_service = HZ; /* period over which service is avg */ -static int cfq_fifo_expire_r = HZ / 2; /* fifo timeout for sync requests */ -static int cfq_fifo_expire_w = 5 * HZ; /* fifo timeout for async requests */ -static int cfq_fifo_rate = HZ / 8; /* fifo expiry rate */ +static int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; static int cfq_back_max = 16 * 1024; /* maximum backwards seek, in KiB */ static int cfq_back_penalty = 2; /* penalty of a backwards seek */ +static int cfq_slice_sync = HZ / 10; +static int cfq_slice_async = HZ / 50; +static int cfq_slice_async_rq = 2; +static int cfq_slice_idle = HZ / 50; + +#define CFQ_IDLE_GRACE (HZ / 10) +#define CFQ_SLICE_SCALE (5) + +#define CFQ_KEY_ASYNC (0) + +/* + * disable queueing at the driver/hardware level + */ +static int cfq_max_depth = 1; + /* * for the hash of cfqq inside the cfqd */ @@ -55,6 +66,7 @@ static int cfq_back_penalty = 2; /* penalty of a backwards seek */ #define list_entry_hash(ptr) hlist_entry((ptr), struct cfq_rq, hash) #define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list) +#define list_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) #define RQ_DATA(rq) (rq)->elevator_private @@ -75,78 +87,101 @@ static int cfq_back_penalty = 2; /* penalty of a backwards seek */ #define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node) #define rq_rb_key(rq) (rq)->sector -/* - * threshold for switching off non-tag accounting - */ -#define CFQ_MAX_TAG (4) - -/* - * sort key types and names - */ -enum { - CFQ_KEY_PGID, - CFQ_KEY_TGID, - CFQ_KEY_UID, - CFQ_KEY_GID, - CFQ_KEY_LAST, -}; - -static char *cfq_key_types[] = { "pgid", "tgid", "uid", "gid", NULL }; - static kmem_cache_t *crq_pool; static kmem_cache_t *cfq_pool; static kmem_cache_t *cfq_ioc_pool; +#define CFQ_PRIO_LISTS 
IOPRIO_BE_NR +#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) +#define cfq_class_be(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_BE) +#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) + +#define cfq_cfqq_sync(cfqq) ((cfqq)->key != CFQ_KEY_ASYNC) + +/* + * Per block device queue structure + */ struct cfq_data { - struct list_head rr_list; + atomic_t ref; + request_queue_t *queue; + + /* + * rr list of queues with requests and the count of them + */ + struct list_head rr_list[CFQ_PRIO_LISTS]; + struct list_head busy_rr; + struct list_head cur_rr; + struct list_head idle_rr; + unsigned int busy_queues; + + /* + * non-ordered list of empty cfqq's + */ struct list_head empty_list; + /* + * cfqq lookup hash + */ struct hlist_head *cfq_hash; - struct hlist_head *crq_hash; - /* queues on rr_list (ie they have pending requests */ - unsigned int busy_queues; + /* + * global crq hash for all queues + */ + struct hlist_head *crq_hash; unsigned int max_queued; - atomic_t ref; + mempool_t *crq_pool; - int key_type; + int rq_in_driver; - mempool_t *crq_pool; + /* + * schedule slice state info + */ + /* + * idle window management + */ + struct timer_list idle_slice_timer; + struct work_struct unplug_work; - request_queue_t *queue; + struct cfq_queue *active_queue; + struct cfq_io_context *active_cic; + int cur_prio, cur_end_prio; + unsigned int dispatch_slice; + + struct timer_list idle_class_timer; sector_t last_sector; + unsigned long last_end_request; - int rq_in_driver; + unsigned int rq_starved; /* * tunables, see top of file */ unsigned int cfq_quantum; unsigned int cfq_queued; - unsigned int cfq_fifo_expire_r; - unsigned int cfq_fifo_expire_w; - unsigned int cfq_fifo_batch_expire; + unsigned int cfq_fifo_expire[2]; unsigned int cfq_back_penalty; unsigned int cfq_back_max; - unsigned int find_best_crq; - - unsigned int cfq_tagged; + unsigned int cfq_slice[2]; + unsigned int cfq_slice_async_rq; + unsigned int cfq_slice_idle; + unsigned int cfq_max_depth; }; +/* + * Per process-grouping structure + */ struct cfq_queue { /* reference count */ atomic_t ref; /* parent cfq_data */ struct cfq_data *cfqd; - /* hash of mergeable requests */ + /* cfqq lookup hash */ struct hlist_node cfq_hash; /* hash key */ - unsigned long key; - /* whether queue is on rr (or empty) list */ - int on_rr; + unsigned int key; /* on either rr or empty list of cfqd */ struct list_head cfq_list; /* sorted list of pending requests */ @@ -158,21 +193,35 @@ struct cfq_queue { /* currently allocated requests */ int allocated[2]; /* fifo list of requests in sort_list */ - struct list_head fifo[2]; - /* last time fifo expired */ - unsigned long last_fifo_expire; - - int key_type; - - unsigned long service_start; - unsigned long service_used; + struct list_head fifo; - unsigned int max_rate; + unsigned long slice_start; + unsigned long slice_end; + unsigned long slice_left; + unsigned long service_last; /* number of requests that have been handed to the driver */ int in_flight; - /* number of currently allocated requests */ - int alloc_limit[2]; + + /* io prio of this group */ + unsigned short ioprio, org_ioprio; + unsigned short ioprio_class, org_ioprio_class; + + /* whether queue is on rr (or empty) list */ + unsigned on_rr : 1; + /* idle slice, waiting for new request submission */ + unsigned wait_request : 1; + /* set when wait_request gets set, reset on first rq alloc */ + unsigned must_alloc : 1; + /* only gets one must_alloc per slice */ + unsigned must_alloc_slice : 1; + /* idle slice, 
request added, now waiting to dispatch it */ + unsigned must_dispatch : 1; + /* fifo expire per-slice */ + unsigned fifo_expire : 1; + + unsigned idle_window : 1; + unsigned prio_changed : 1; }; struct cfq_rq { @@ -184,42 +233,17 @@ struct cfq_rq { struct cfq_queue *cfq_queue; struct cfq_io_context *io_context; - unsigned long service_start; - unsigned long queue_start; - - unsigned int in_flight : 1; - unsigned int accounted : 1; - unsigned int is_sync : 1; - unsigned int is_write : 1; + unsigned in_flight : 1; + unsigned accounted : 1; + unsigned is_sync : 1; + unsigned requeued : 1; }; -static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned long); +static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int); static void cfq_dispatch_sort(request_queue_t *, struct cfq_rq *); -static void cfq_update_next_crq(struct cfq_rq *); static void cfq_put_cfqd(struct cfq_data *cfqd); -/* - * what the fairness is based on (ie how processes are grouped and - * differentiated) - */ -static inline unsigned long -cfq_hash_key(struct cfq_data *cfqd, struct task_struct *tsk) -{ - /* - * optimize this so that ->key_type is the offset into the struct - */ - switch (cfqd->key_type) { - case CFQ_KEY_PGID: - return process_group(tsk); - default: - case CFQ_KEY_TGID: - return tsk->tgid; - case CFQ_KEY_UID: - return tsk->uid; - case CFQ_KEY_GID: - return tsk->gid; - } -} +#define process_sync(tsk) ((tsk)->flags & PF_SYNCWRITE) /* * lots of deadline iosched dupes, can be abstracted later... @@ -235,16 +259,12 @@ static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq) if (q->last_merge == crq->request) q->last_merge = NULL; - - cfq_update_next_crq(crq); } static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq) { const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request)); - BUG_ON(!hlist_unhashed(&crq->hash)); - hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]); } @@ -257,8 +277,6 @@ static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) struct cfq_rq *crq = list_entry_hash(entry); struct request *__rq = crq->request; - BUG_ON(hlist_unhashed(&crq->hash)); - if (!rq_mergeable(__rq)) { cfq_del_crq_hash(crq); continue; @@ -287,36 +305,16 @@ cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2) return crq2; if (crq2 == NULL) return crq1; + if (crq1->requeued) + return crq1; + if (crq2->requeued) + return crq2; s1 = crq1->request->sector; s2 = crq2->request->sector; last = cfqd->last_sector; -#if 0 - if (!list_empty(&cfqd->queue->queue_head)) { - struct list_head *entry = &cfqd->queue->queue_head; - unsigned long distance = ~0UL; - struct request *rq; - - while ((entry = entry->prev) != &cfqd->queue->queue_head) { - rq = list_entry_rq(entry); - - if (blk_barrier_rq(rq)) - break; - - if (distance < abs(s1 - rq->sector + rq->nr_sectors)) { - distance = abs(s1 - rq->sector +rq->nr_sectors); - last = rq->sector + rq->nr_sectors; - } - if (distance < abs(s2 - rq->sector + rq->nr_sectors)) { - distance = abs(s2 - rq->sector +rq->nr_sectors); - last = rq->sector + rq->nr_sectors; - } - } - } -#endif - /* * by definition, 1KiB is 2 sectors */ @@ -377,11 +375,13 @@ cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_rq *crq_next = NULL, *crq_prev = NULL; struct rb_node *rbnext, *rbprev; - if (!ON_RB(&last->rb_node)) - return NULL; - - if ((rbnext = rb_next(&last->rb_node)) == NULL) + if (ON_RB(&last->rb_node)) + rbnext = rb_next(&last->rb_node); + else { rbnext = 
rb_first(&cfqq->sort_list); + if (rbnext == &last->rb_node) + rbnext = NULL; + } rbprev = rb_prev(&last->rb_node); @@ -401,67 +401,53 @@ static void cfq_update_next_crq(struct cfq_rq *crq) cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq); } -static int cfq_check_sort_rr_list(struct cfq_queue *cfqq) +static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) { - struct list_head *head = &cfqq->cfqd->rr_list; - struct list_head *next, *prev; - - /* - * list might still be ordered - */ - next = cfqq->cfq_list.next; - if (next != head) { - struct cfq_queue *cnext = list_entry_cfqq(next); + struct cfq_data *cfqd = cfqq->cfqd; + struct list_head *list, *entry; - if (cfqq->service_used > cnext->service_used) - return 1; - } + BUG_ON(!cfqq->on_rr); - prev = cfqq->cfq_list.prev; - if (prev != head) { - struct cfq_queue *cprev = list_entry_cfqq(prev); + list_del(&cfqq->cfq_list); - if (cfqq->service_used < cprev->service_used) - return 1; + if (cfq_class_rt(cfqq)) + list = &cfqd->cur_rr; + else if (cfq_class_idle(cfqq)) + list = &cfqd->idle_rr; + else { + /* + * if cfqq has requests in flight, don't allow it to be + * found in cfq_set_active_queue before it has finished them. + * this is done to increase fairness between a process that + * has lots of io pending vs one that only generates one + * sporadically or synchronously + */ + if (cfqq->in_flight) + list = &cfqd->busy_rr; + else + list = &cfqd->rr_list[cfqq->ioprio]; } - return 0; -} - -static void cfq_sort_rr_list(struct cfq_queue *cfqq, int new_queue) -{ - struct list_head *entry = &cfqq->cfqd->rr_list; - - if (!cfqq->on_rr) - return; - if (!new_queue && !cfq_check_sort_rr_list(cfqq)) + /* + * if queue was preempted, just add to front to be fair. busy_rr + * isn't sorted. + */ + if (preempted || list == &cfqd->busy_rr) { + list_add(&cfqq->cfq_list, list); return; - - list_del(&cfqq->cfq_list); + } /* - * sort by our mean service_used, sub-sort by in-flight requests + * sort by when queue was last serviced */ - while ((entry = entry->prev) != &cfqq->cfqd->rr_list) { + entry = list; + while ((entry = entry->prev) != list) { struct cfq_queue *__cfqq = list_entry_cfqq(entry); - if (cfqq->service_used > __cfqq->service_used) + if (!__cfqq->service_last) + break; + if (time_before(__cfqq->service_last, cfqq->service_last)) break; - else if (cfqq->service_used == __cfqq->service_used) { - struct list_head *prv; - - while ((prv = entry->prev) != &cfqq->cfqd->rr_list) { - __cfqq = list_entry_cfqq(prv); - - WARN_ON(__cfqq->service_used > cfqq->service_used); - if (cfqq->service_used != __cfqq->service_used) - break; - if (cfqq->in_flight > __cfqq->in_flight) - break; - - entry = prv; - } - } } list_add(&cfqq->cfq_list, entry); @@ -469,28 +455,24 @@ static void cfq_sort_rr_list(struct cfq_queue *cfqq, int new_queue) /* * add to busy list of queues for service, trying to be fair in ordering - * the pending list according to requests serviced + * the pending list according to last request service */ static inline void -cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) +cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq, int requeue) { - /* - * it's currently on the empty list - */ + BUG_ON(cfqq->on_rr); cfqq->on_rr = 1; cfqd->busy_queues++; - if (time_after(jiffies, cfqq->service_start + cfq_service)) - cfqq->service_used >>= 3; - - cfq_sort_rr_list(cfqq, 1); + cfq_resort_rr_list(cfqq, requeue); } static inline void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - list_move(&cfqq->cfq_list, 
&cfqd->empty_list); + BUG_ON(!cfqq->on_rr); cfqq->on_rr = 0; + list_move(&cfqq->cfq_list, &cfqd->empty_list); BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; @@ -505,16 +487,17 @@ static inline void cfq_del_crq_rb(struct cfq_rq *crq) if (ON_RB(&crq->rb_node)) { struct cfq_data *cfqd = cfqq->cfqd; + const int sync = crq->is_sync; - BUG_ON(!cfqq->queued[crq->is_sync]); + BUG_ON(!cfqq->queued[sync]); + cfqq->queued[sync]--; cfq_update_next_crq(crq); - cfqq->queued[crq->is_sync]--; rb_erase(&crq->rb_node, &cfqq->sort_list); RB_CLEAR_COLOR(&crq->rb_node); - if (RB_EMPTY(&cfqq->sort_list) && cfqq->on_rr) + if (cfqq->on_rr && RB_EMPTY(&cfqq->sort_list)) cfq_del_cfqq_rr(cfqd, cfqq); } } @@ -562,7 +545,7 @@ static void cfq_add_crq_rb(struct cfq_rq *crq) rb_insert_color(&crq->rb_node, &cfqq->sort_list); if (!cfqq->on_rr) - cfq_add_cfqq_rr(cfqd, cfqq); + cfq_add_cfqq_rr(cfqd, cfqq, crq->requeued); /* * check if this request is a better next-serve candidate @@ -581,11 +564,10 @@ cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) cfq_add_crq_rb(crq); } -static struct request * -cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector) +static struct request *cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector) + { - const unsigned long key = cfq_hash_key(cfqd, current); - struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, key); + struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid); struct rb_node *n; if (!cfqq) @@ -609,20 +591,23 @@ out: static void cfq_deactivate_request(request_queue_t *q, struct request *rq) { + struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_rq *crq = RQ_DATA(rq); if (crq) { struct cfq_queue *cfqq = crq->cfq_queue; - if (cfqq->cfqd->cfq_tagged) { - cfqq->service_used--; - cfq_sort_rr_list(cfqq, 0); - } - if (crq->accounted) { crq->accounted = 0; - cfqq->cfqd->rq_in_driver--; + WARN_ON(!cfqd->rq_in_driver); + cfqd->rq_in_driver--; } + if (crq->in_flight) { + crq->in_flight = 0; + WARN_ON(!cfqq->in_flight); + cfqq->in_flight--; + } + crq->requeued = 1; } } @@ -640,11 +625,10 @@ static void cfq_remove_request(request_queue_t *q, struct request *rq) struct cfq_rq *crq = RQ_DATA(rq); if (crq) { - cfq_remove_merge_hints(q, crq); list_del_init(&rq->queuelist); + cfq_del_crq_rb(crq); + cfq_remove_merge_hints(q, crq); - if (crq->cfq_queue) - cfq_del_crq_rb(crq); } } @@ -662,21 +646,15 @@ cfq_merge(request_queue_t *q, struct request **req, struct bio *bio) } __rq = cfq_find_rq_hash(cfqd, bio->bi_sector); - if (__rq) { - BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector); - - if (elv_rq_merge_ok(__rq, bio)) { - ret = ELEVATOR_BACK_MERGE; - goto out; - } + if (__rq && elv_rq_merge_ok(__rq, bio)) { + ret = ELEVATOR_BACK_MERGE; + goto out; } __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio)); - if (__rq) { - if (elv_rq_merge_ok(__rq, bio)) { - ret = ELEVATOR_FRONT_MERGE; - goto out; - } + if (__rq && elv_rq_merge_ok(__rq, bio)) { + ret = ELEVATOR_FRONT_MERGE; + goto out; } return ELEVATOR_NO_MERGE; @@ -709,20 +687,194 @@ static void cfq_merged_requests(request_queue_t *q, struct request *rq, struct request *next) { - struct cfq_rq *crq = RQ_DATA(rq); - struct cfq_rq *cnext = RQ_DATA(next); - cfq_merged_request(q, rq); - if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist)) { - if (time_before(cnext->queue_start, crq->queue_start)) { - list_move(&rq->queuelist, &next->queuelist); - crq->queue_start = cnext->queue_start; + /* + * reposition in fifo if next is older than rq + */ + if (!list_empty(&rq->queuelist) && 
!list_empty(&next->queuelist) && + time_before(next->start_time, rq->start_time)) + list_move(&rq->queuelist, &next->queuelist); + + cfq_remove_request(q, next); +} + +static inline void +__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + if (cfqq) { + /* + * stop potential idle class queues waiting service + */ + del_timer(&cfqd->idle_class_timer); + + cfqq->slice_start = jiffies; + cfqq->slice_end = 0; + cfqq->slice_left = 0; + cfqq->must_alloc_slice = 0; + cfqq->fifo_expire = 0; + } + + cfqd->active_queue = cfqq; +} + +/* + * 0 + * 0,1 + * 0,1,2 + * 0,1,2,3 + * 0,1,2,3,4 + * 0,1,2,3,4,5 + * 0,1,2,3,4,5,6 + * 0,1,2,3,4,5,6,7 + */ +static int cfq_get_next_prio_level(struct cfq_data *cfqd) +{ + int prio, wrap; + + prio = -1; + wrap = 0; + do { + int p; + + for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) { + if (!list_empty(&cfqd->rr_list[p])) { + prio = p; + break; + } + } + + if (prio != -1) + break; + cfqd->cur_prio = 0; + if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) { + cfqd->cur_end_prio = 0; + if (wrap) + break; + wrap = 1; } + } while (1); + + if (unlikely(prio == -1)) + return -1; + + BUG_ON(prio >= CFQ_PRIO_LISTS); + + list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr); + + cfqd->cur_prio = prio + 1; + if (cfqd->cur_prio > cfqd->cur_end_prio) { + cfqd->cur_end_prio = cfqd->cur_prio; + cfqd->cur_prio = 0; + } + if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) { + cfqd->cur_prio = 0; + cfqd->cur_end_prio = 0; } - cfq_update_next_crq(cnext); - cfq_remove_request(q, next); + return prio; +} + +static void cfq_set_active_queue(struct cfq_data *cfqd) +{ + struct cfq_queue *cfqq = NULL; + + /* + * if current list is non-empty, grab first entry. if it is empty, + * get next prio level and grab first entry then if any are spliced + */ + if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) + cfqq = list_entry_cfqq(cfqd->cur_rr.next); + + /* + * if we have idle queues and no rt or be queues had pending + * requests, either allow immediate service if the grace period + * has passed or arm the idle grace timer + */ + if (!cfqq && !list_empty(&cfqd->idle_rr)) { + unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE; + + if (time_after_eq(jiffies, end)) + cfqq = list_entry_cfqq(cfqd->idle_rr.next); + else + mod_timer(&cfqd->idle_class_timer, end); + } + + __cfq_set_active_queue(cfqd, cfqq); +} + +/* + * current cfqq expired its slice (or was too idle), select new one + */ +static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted) +{ + struct cfq_queue *cfqq = cfqd->active_queue; + + if (cfqq) { + unsigned long now = jiffies; + + if (cfqq->wait_request) + del_timer(&cfqd->idle_slice_timer); + + if (!preempted && !cfqq->in_flight) + cfqq->service_last = now; + + cfqq->must_dispatch = 0; + cfqq->wait_request = 0; + + /* + * store what was left of this slice, if the queue idled out + * or was preempted + */ + if (time_after(now, cfqq->slice_end)) + cfqq->slice_left = now - cfqq->slice_end; + else + cfqq->slice_left = 0; + + if (cfqq->on_rr) + cfq_resort_rr_list(cfqq, preempted); + + cfqd->active_queue = NULL; + + if (cfqd->active_cic) { + put_io_context(cfqd->active_cic->ioc); + cfqd->active_cic = NULL; + } + } + + cfqd->dispatch_slice = 0; +} + +static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) + +{ + WARN_ON(!RB_EMPTY(&cfqq->sort_list)); + WARN_ON(cfqq != cfqd->active_queue); + + /* + * idle is disabled, either manually or by past process history + */ + if (!cfqd->cfq_slice_idle) + return 0; + if 
(!cfqq->idle_window) + return 0; + /* + * task has exited, don't wait + */ + if (cfqd->active_cic && !cfqd->active_cic->ioc->task) + return 0; + + cfqq->wait_request = 1; + cfqq->must_alloc = 1; + + if (!timer_pending(&cfqd->idle_slice_timer)) { + unsigned long slice_left = cfqq->slice_end - 1; + + cfqd->idle_slice_timer.expires = min(jiffies + cfqd->cfq_slice_idle, slice_left); + add_timer(&cfqd->idle_slice_timer); + } + + return 1; } /* @@ -738,31 +890,39 @@ static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq) struct request *__rq; sector_t last; - cfq_del_crq_rb(crq); - cfq_remove_merge_hints(q, crq); list_del(&crq->request->queuelist); last = cfqd->last_sector; - while ((entry = entry->prev) != head) { - __rq = list_entry_rq(entry); + list_for_each_entry_reverse(__rq, head, queuelist) { + struct cfq_rq *__crq = RQ_DATA(__rq); - if (blk_barrier_rq(crq->request)) + if (blk_barrier_rq(__rq)) + break; + if (!blk_fs_request(__rq)) break; - if (!blk_fs_request(crq->request)) + if (__crq->requeued) break; - if (crq->request->sector > __rq->sector) + if (__rq->sector <= crq->request->sector) break; if (__rq->sector > last && crq->request->sector < last) { - last = crq->request->sector; + last = crq->request->sector + crq->request->nr_sectors; break; } + entry = &__rq->queuelist; } cfqd->last_sector = last; + + cfqq->next_crq = cfq_find_next_crq(cfqd, cfqq, crq); + + cfq_del_crq_rb(crq); + cfq_remove_merge_hints(q, crq); + crq->in_flight = 1; + crq->requeued = 0; cfqq->in_flight++; - list_add(&crq->request->queuelist, entry); + list_add_tail(&crq->request->queuelist, entry); } /* @@ -771,105 +931,176 @@ static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq) static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq) { struct cfq_data *cfqd = cfqq->cfqd; - const int reads = !list_empty(&cfqq->fifo[0]); - const int writes = !list_empty(&cfqq->fifo[1]); - unsigned long now = jiffies; + struct request *rq; struct cfq_rq *crq; - if (time_before(now, cfqq->last_fifo_expire + cfqd->cfq_fifo_batch_expire)) + if (cfqq->fifo_expire) return NULL; - crq = RQ_DATA(list_entry(cfqq->fifo[0].next, struct request, queuelist)); - if (reads && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_r)) { - cfqq->last_fifo_expire = now; - return crq; - } + if (!list_empty(&cfqq->fifo)) { + int fifo = cfq_cfqq_sync(cfqq); - crq = RQ_DATA(list_entry(cfqq->fifo[1].next, struct request, queuelist)); - if (writes && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_w)) { - cfqq->last_fifo_expire = now; - return crq; + crq = RQ_DATA(list_entry_fifo(cfqq->fifo.next)); + rq = crq->request; + if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) { + cfqq->fifo_expire = 1; + return crq; + } } return NULL; } /* - * dispatch a single request from given queue + * Scale schedule slice based on io priority */ +static inline int +cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)]; + + WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); + + return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio)); +} + static inline void -cfq_dispatch_request(request_queue_t *q, struct cfq_data *cfqd, - struct cfq_queue *cfqq) +cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - struct cfq_rq *crq; + cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; +} - /* - * follow expired path, else get first next available - */ - if ((crq = cfq_check_fifo(cfqq)) == NULL) { - if 
(cfqd->find_best_crq) - crq = cfqq->next_crq; - else - crq = rb_entry_crq(rb_first(&cfqq->sort_list)); - } +static inline int +cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + const int base_rq = cfqd->cfq_slice_async_rq; - cfqd->last_sector = crq->request->sector + crq->request->nr_sectors; + WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); - /* - * finally, insert request into driver list - */ - cfq_dispatch_sort(q, crq); + return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); } -static int cfq_dispatch_requests(request_queue_t *q, int max_dispatch) +/* + * get next queue for service + */ +static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd, int force) { - struct cfq_data *cfqd = q->elevator->elevator_data; + unsigned long now = jiffies; struct cfq_queue *cfqq; - struct list_head *entry, *tmp; - int queued, busy_queues, first_round; - if (list_empty(&cfqd->rr_list)) - return 0; + cfqq = cfqd->active_queue; + if (!cfqq) + goto new_queue; - queued = 0; - first_round = 1; -restart: - busy_queues = 0; - list_for_each_safe(entry, tmp, &cfqd->rr_list) { - cfqq = list_entry_cfqq(entry); + /* + * slice has expired + */ + if (!cfqq->must_dispatch && time_after(jiffies, cfqq->slice_end)) + goto new_queue; - BUG_ON(RB_EMPTY(&cfqq->sort_list)); + /* + * if queue has requests, dispatch one. if not, check if + * enough slice is left to wait for one + */ + if (!RB_EMPTY(&cfqq->sort_list)) + goto keep_queue; + else if (!force && cfq_cfqq_sync(cfqq) && + time_before(now, cfqq->slice_end)) { + if (cfq_arm_slice_timer(cfqd, cfqq)) + return NULL; + } + +new_queue: + cfq_slice_expired(cfqd, 0); + cfq_set_active_queue(cfqd); +keep_queue: + return cfqd->active_queue; +} + +static int +__cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq, + int max_dispatch) +{ + int dispatched = 0; + + BUG_ON(RB_EMPTY(&cfqq->sort_list)); + + do { + struct cfq_rq *crq; /* - * first round of queueing, only select from queues that - * don't already have io in-flight + * follow expired path, else get first next available */ - if (first_round && cfqq->in_flight) - continue; + if ((crq = cfq_check_fifo(cfqq)) == NULL) + crq = cfqq->next_crq; + + /* + * finally, insert request into driver dispatch list + */ + cfq_dispatch_sort(cfqd->queue, crq); - cfq_dispatch_request(q, cfqd, cfqq); + cfqd->dispatch_slice++; + dispatched++; - if (!RB_EMPTY(&cfqq->sort_list)) - busy_queues++; + if (!cfqd->active_cic) { + atomic_inc(&crq->io_context->ioc->refcount); + cfqd->active_cic = crq->io_context; + } - queued++; - } + if (RB_EMPTY(&cfqq->sort_list)) + break; + + } while (dispatched < max_dispatch); + + /* + * if slice end isn't set yet, set it. if at least one request was + * sync, use the sync time slice value + */ + if (!cfqq->slice_end) + cfq_set_prio_slice(cfqd, cfqq); + + /* + * expire an async queue immediately if it has used up its slice. idle + * queue always expire after 1 dispatch round. 
+ */ + if ((!cfq_cfqq_sync(cfqq) && + cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) || + cfq_class_idle(cfqq)) + cfq_slice_expired(cfqd, 0); + + return dispatched; +} + +static int +cfq_dispatch_requests(request_queue_t *q, int max_dispatch, int force) +{ + struct cfq_data *cfqd = q->elevator->elevator_data; + struct cfq_queue *cfqq; + + if (!cfqd->busy_queues) + return 0; + + cfqq = cfq_select_queue(cfqd, force); + if (cfqq) { + cfqq->wait_request = 0; + cfqq->must_dispatch = 0; + del_timer(&cfqd->idle_slice_timer); + + if (cfq_class_idle(cfqq)) + max_dispatch = 1; - if ((queued < max_dispatch) && (busy_queues || first_round)) { - first_round = 0; - goto restart; + return __cfq_dispatch_requests(cfqd, cfqq, max_dispatch); } - return queued; + return 0; } static inline void cfq_account_dispatch(struct cfq_rq *crq) { struct cfq_queue *cfqq = crq->cfq_queue; struct cfq_data *cfqd = cfqq->cfqd; - unsigned long now, elapsed; - if (!blk_fs_request(crq->request)) + if (unlikely(!blk_fs_request(crq->request))) return; /* @@ -879,65 +1110,34 @@ static inline void cfq_account_dispatch(struct cfq_rq *crq) if (crq->accounted) return; - now = jiffies; - if (cfqq->service_start == ~0UL) - cfqq->service_start = now; - - /* - * on drives with tagged command queueing, command turn-around time - * doesn't necessarily reflect the time spent processing this very - * command inside the drive. so do the accounting differently there, - * by just sorting on the number of requests - */ - if (cfqd->cfq_tagged) { - if (time_after(now, cfqq->service_start + cfq_service)) { - cfqq->service_start = now; - cfqq->service_used /= 10; - } - - cfqq->service_used++; - cfq_sort_rr_list(cfqq, 0); - } - - elapsed = now - crq->queue_start; - if (elapsed > max_elapsed_dispatch) - max_elapsed_dispatch = elapsed; - crq->accounted = 1; - crq->service_start = now; - - if (++cfqd->rq_in_driver >= CFQ_MAX_TAG && !cfqd->cfq_tagged) { - cfqq->cfqd->cfq_tagged = 1; - printk("cfq: depth %d reached, tagging now on\n", CFQ_MAX_TAG); - } + cfqd->rq_in_driver++; } static inline void cfq_account_completion(struct cfq_queue *cfqq, struct cfq_rq *crq) { struct cfq_data *cfqd = cfqq->cfqd; + unsigned long now; if (!crq->accounted) return; + now = jiffies; + WARN_ON(!cfqd->rq_in_driver); cfqd->rq_in_driver--; - if (!cfqd->cfq_tagged) { - unsigned long now = jiffies; - unsigned long duration = now - crq->service_start; + if (!cfq_class_idle(cfqq)) + cfqd->last_end_request = now; - if (time_after(now, cfqq->service_start + cfq_service)) { - cfqq->service_start = now; - cfqq->service_used >>= 3; - } - - cfqq->service_used += duration; - cfq_sort_rr_list(cfqq, 0); - - if (duration > max_elapsed_crq) - max_elapsed_crq = duration; + if (!cfqq->in_flight && cfqq->on_rr) { + cfqq->service_last = now; + cfq_resort_rr_list(cfqq, 0); } + + if (crq->is_sync) + crq->io_context->last_end_request = now; } static struct request *cfq_next_request(request_queue_t *q) @@ -950,7 +1150,15 @@ static struct request *cfq_next_request(request_queue_t *q) dispatch: rq = list_entry_rq(q->queue_head.next); - if ((crq = RQ_DATA(rq)) != NULL) { + crq = RQ_DATA(rq); + if (crq) { + /* + * if idle window is disabled, allow queue buildup + */ + if (!crq->in_flight && !crq->cfq_queue->idle_window && + cfqd->rq_in_driver >= cfqd->cfq_max_depth) + return NULL; + cfq_remove_merge_hints(q, crq); cfq_account_dispatch(crq); } @@ -958,7 +1166,7 @@ dispatch: return rq; } - if (cfq_dispatch_requests(q, cfqd->cfq_quantum)) + if (cfq_dispatch_requests(q, cfqd->cfq_quantum, 0)) goto 
dispatch; return NULL; @@ -972,14 +1180,22 @@ dispatch: */ static void cfq_put_queue(struct cfq_queue *cfqq) { - BUG_ON(!atomic_read(&cfqq->ref)); + struct cfq_data *cfqd = cfqq->cfqd; + + BUG_ON(atomic_read(&cfqq->ref) <= 0); if (!atomic_dec_and_test(&cfqq->ref)) return; BUG_ON(rb_first(&cfqq->sort_list)); + BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); BUG_ON(cfqq->on_rr); + if (unlikely(cfqd->active_queue == cfqq)) { + cfq_slice_expired(cfqd, 0); + kblockd_schedule_work(&cfqd->unplug_work); + } + cfq_put_cfqd(cfqq->cfqd); /* @@ -991,7 +1207,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq) } static inline struct cfq_queue * -__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key, const int hashval) +__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, const int hashval) { struct hlist_head *hash_list = &cfqd->cfq_hash[hashval]; struct hlist_node *entry, *next; @@ -1007,94 +1223,220 @@ __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key, const int hashval) } static struct cfq_queue * -cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key) +cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key) { return __cfq_find_cfq_hash(cfqd, key, hash_long(key, CFQ_QHASH_SHIFT)); } -static inline void -cfq_rehash_cfqq(struct cfq_data *cfqd, struct cfq_queue **cfqq, - struct cfq_io_context *cic) +static void cfq_free_io_context(struct cfq_io_context *cic) { - unsigned long hashkey = cfq_hash_key(cfqd, current); - unsigned long hashval = hash_long(hashkey, CFQ_QHASH_SHIFT); - struct cfq_queue *__cfqq; - unsigned long flags; + struct cfq_io_context *__cic; + struct list_head *entry, *next; - spin_lock_irqsave(cfqd->queue->queue_lock, flags); - - hlist_del(&(*cfqq)->cfq_hash); - - __cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval); - if (!__cfqq || __cfqq == *cfqq) { - __cfqq = *cfqq; - hlist_add_head(&__cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); - __cfqq->key_type = cfqd->key_type; - } else { - atomic_inc(&__cfqq->ref); - cic->cfqq = __cfqq; - cfq_put_queue(*cfqq); - *cfqq = __cfqq; + list_for_each_safe(entry, next, &cic->list) { + __cic = list_entry(entry, struct cfq_io_context, list); + kmem_cache_free(cfq_ioc_pool, __cic); } - cic->cfqq = __cfqq; - spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); + kmem_cache_free(cfq_ioc_pool, cic); } -static void cfq_free_io_context(struct cfq_io_context *cic) +/* + * Called with interrupts disabled + */ +static void cfq_exit_single_io_context(struct cfq_io_context *cic) { - kmem_cache_free(cfq_ioc_pool, cic); + struct cfq_data *cfqd = cic->cfqq->cfqd; + request_queue_t *q = cfqd->queue; + + WARN_ON(!irqs_disabled()); + + spin_lock(q->queue_lock); + + if (unlikely(cic->cfqq == cfqd->active_queue)) { + cfq_slice_expired(cfqd, 0); + kblockd_schedule_work(&cfqd->unplug_work); + } + + cfq_put_queue(cic->cfqq); + cic->cfqq = NULL; + spin_unlock(q->queue_lock); } /* - * locking hierarchy is: io_context lock -> queue locks + * Another task may update the task cic list, if it is doing a queue lookup + * on its behalf. 
cfq_cic_lock excludes such concurrent updates */ static void cfq_exit_io_context(struct cfq_io_context *cic) { - struct cfq_queue *cfqq = cic->cfqq; - struct list_head *entry = &cic->list; - request_queue_t *q; + struct cfq_io_context *__cic; + struct list_head *entry; unsigned long flags; + local_irq_save(flags); + /* * put the reference this task is holding to the various queues */ - spin_lock_irqsave(&cic->ioc->lock, flags); - while ((entry = cic->list.next) != &cic->list) { - struct cfq_io_context *__cic; - + list_for_each(entry, &cic->list) { __cic = list_entry(entry, struct cfq_io_context, list); - list_del(entry); - - q = __cic->cfqq->cfqd->queue; - spin_lock(q->queue_lock); - cfq_put_queue(__cic->cfqq); - spin_unlock(q->queue_lock); + cfq_exit_single_io_context(__cic); } - q = cfqq->cfqd->queue; - spin_lock(q->queue_lock); - cfq_put_queue(cfqq); - spin_unlock(q->queue_lock); - - cic->cfqq = NULL; - spin_unlock_irqrestore(&cic->ioc->lock, flags); + cfq_exit_single_io_context(cic); + local_irq_restore(flags); } -static struct cfq_io_context *cfq_alloc_io_context(int gfp_flags) +static struct cfq_io_context * +cfq_alloc_io_context(struct cfq_data *cfqd, int gfp_mask) { - struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_flags); + struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask); if (cic) { - cic->dtor = cfq_free_io_context; - cic->exit = cfq_exit_io_context; INIT_LIST_HEAD(&cic->list); cic->cfqq = NULL; + cic->key = NULL; + cic->last_end_request = jiffies; + cic->ttime_total = 0; + cic->ttime_samples = 0; + cic->ttime_mean = 0; + cic->dtor = cfq_free_io_context; + cic->exit = cfq_exit_io_context; } return cic; } +static void cfq_init_prio_data(struct cfq_queue *cfqq) +{ + struct task_struct *tsk = current; + int ioprio_class; + + if (!cfqq->prio_changed) + return; + + ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio); + switch (ioprio_class) { + default: + printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); + case IOPRIO_CLASS_NONE: + /* + * no prio set, place us in the middle of the BE classes + */ + cfqq->ioprio = task_nice_ioprio(tsk); + cfqq->ioprio_class = IOPRIO_CLASS_BE; + break; + case IOPRIO_CLASS_RT: + cfqq->ioprio = task_ioprio(tsk); + cfqq->ioprio_class = IOPRIO_CLASS_RT; + break; + case IOPRIO_CLASS_BE: + cfqq->ioprio = task_ioprio(tsk); + cfqq->ioprio_class = IOPRIO_CLASS_BE; + break; + case IOPRIO_CLASS_IDLE: + cfqq->ioprio_class = IOPRIO_CLASS_IDLE; + cfqq->ioprio = 7; + cfqq->idle_window = 0; + break; + } + + /* + * keep track of original prio settings in case we have to temporarily + * elevate the priority of this queue + */ + cfqq->org_ioprio = cfqq->ioprio; + cfqq->org_ioprio_class = cfqq->ioprio_class; + + if (cfqq->on_rr) + cfq_resort_rr_list(cfqq, 0); + + cfqq->prio_changed = 0; +} + +static inline void changed_ioprio(struct cfq_queue *cfqq) +{ + if (cfqq) { + struct cfq_data *cfqd = cfqq->cfqd; + + spin_lock(cfqd->queue->queue_lock); + cfqq->prio_changed = 1; + cfq_init_prio_data(cfqq); + spin_unlock(cfqd->queue->queue_lock); + } +} + +/* + * callback from sys_ioprio_set, irqs are disabled + */ +static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio) +{ + struct cfq_io_context *cic = ioc->cic; + + changed_ioprio(cic->cfqq); + + list_for_each_entry(cic, &cic->list, list) + changed_ioprio(cic->cfqq); + + return 0; +} + +static struct cfq_queue * +cfq_get_queue(struct cfq_data *cfqd, unsigned int key, int gfp_mask) +{ + const int hashval = hash_long(key, CFQ_QHASH_SHIFT); + struct cfq_queue *cfqq, *new_cfqq = 
NULL; + +retry: + cfqq = __cfq_find_cfq_hash(cfqd, key, hashval); + + if (!cfqq) { + if (new_cfqq) { + cfqq = new_cfqq; + new_cfqq = NULL; + } else if (gfp_mask & __GFP_WAIT) { + spin_unlock_irq(cfqd->queue->queue_lock); + new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); + spin_lock_irq(cfqd->queue->queue_lock); + goto retry; + } else { + cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); + if (!cfqq) + goto out; + } + + memset(cfqq, 0, sizeof(*cfqq)); + + INIT_HLIST_NODE(&cfqq->cfq_hash); + INIT_LIST_HEAD(&cfqq->cfq_list); + RB_CLEAR_ROOT(&cfqq->sort_list); + INIT_LIST_HEAD(&cfqq->fifo); + + cfqq->key = key; + hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); + atomic_set(&cfqq->ref, 0); + cfqq->cfqd = cfqd; + atomic_inc(&cfqd->ref); + cfqq->service_last = 0; + /* + * set ->slice_left to allow preemption for a new process + */ + cfqq->slice_left = 2 * cfqd->cfq_slice_idle; + cfqq->idle_window = 1; + cfqq->ioprio = -1; + cfqq->ioprio_class = -1; + cfqq->prio_changed = 1; + } + + if (new_cfqq) + kmem_cache_free(cfq_pool, new_cfqq); + + atomic_inc(&cfqq->ref); +out: + WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq); + return cfqq; +} + /* * Setup general io context and cfq io context. There can be several cfq * io contexts per general io context, if this process is doing io to more @@ -1102,39 +1444,39 @@ static struct cfq_io_context *cfq_alloc_io_context(int gfp_flags) * cfqq, so we don't need to worry about it disappearing */ static struct cfq_io_context * -cfq_get_io_context(struct cfq_queue **cfqq, int gfp_flags) +cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, int gfp_mask) { - struct cfq_data *cfqd = (*cfqq)->cfqd; - struct cfq_queue *__cfqq = *cfqq; + struct io_context *ioc = NULL; struct cfq_io_context *cic; - struct io_context *ioc; - might_sleep_if(gfp_flags & __GFP_WAIT); + might_sleep_if(gfp_mask & __GFP_WAIT); - ioc = get_io_context(gfp_flags); + ioc = get_io_context(gfp_mask); if (!ioc) return NULL; if ((cic = ioc->cic) == NULL) { - cic = cfq_alloc_io_context(gfp_flags); + cic = cfq_alloc_io_context(cfqd, gfp_mask); if (cic == NULL) goto err; + /* + * manually increment generic io_context usage count, it + * cannot go away since we are already holding one ref to it + */ ioc->cic = cic; + ioc->set_ioprio = cfq_ioc_set_ioprio; cic->ioc = ioc; - cic->cfqq = __cfqq; - atomic_inc(&__cfqq->ref); + cic->key = cfqd; + atomic_inc(&cfqd->ref); } else { struct cfq_io_context *__cic; - unsigned long flags; /* - * since the first cic on the list is actually the head - * itself, need to check this here or we'll duplicate an - * cic per ioc for no reason + * the first cic on the list is actually the head itself */ - if (cic->cfqq == __cfqq) + if (cic->key == cfqd) goto out; /* @@ -1142,152 +1484,259 @@ cfq_get_io_context(struct cfq_queue **cfqq, int gfp_flags) * should be ok here, the list will usually not be more than * 1 or a few entries long */ - spin_lock_irqsave(&ioc->lock, flags); list_for_each_entry(__cic, &cic->list, list) { /* * this process is already holding a reference to * this queue, so no need to get one more */ - if (__cic->cfqq == __cfqq) { + if (__cic->key == cfqd) { cic = __cic; - spin_unlock_irqrestore(&ioc->lock, flags); goto out; } } - spin_unlock_irqrestore(&ioc->lock, flags); /* * nope, process doesn't have a cic assoicated with this * cfqq yet. 
get a new one and add to list */ - __cic = cfq_alloc_io_context(gfp_flags); + __cic = cfq_alloc_io_context(cfqd, gfp_mask); if (__cic == NULL) goto err; __cic->ioc = ioc; - __cic->cfqq = __cfqq; - atomic_inc(&__cfqq->ref); - spin_lock_irqsave(&ioc->lock, flags); + __cic->key = cfqd; + atomic_inc(&cfqd->ref); list_add(&__cic->list, &cic->list); - spin_unlock_irqrestore(&ioc->lock, flags); - cic = __cic; - *cfqq = __cfqq; } out: - /* - * if key_type has been changed on the fly, we lazily rehash - * each queue at lookup time - */ - if ((*cfqq)->key_type != cfqd->key_type) - cfq_rehash_cfqq(cfqd, cfqq, cic); - return cic; err: put_io_context(ioc); return NULL; } -static struct cfq_queue * -__cfq_get_queue(struct cfq_data *cfqd, unsigned long key, int gfp_mask) +static void +cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) { - const int hashval = hash_long(key, CFQ_QHASH_SHIFT); - struct cfq_queue *cfqq, *new_cfqq = NULL; + unsigned long elapsed, ttime; -retry: - cfqq = __cfq_find_cfq_hash(cfqd, key, hashval); + /* + * if this context already has stuff queued, thinktime is from + * last queue not last end + */ +#if 0 + if (time_after(cic->last_end_request, cic->last_queue)) + elapsed = jiffies - cic->last_end_request; + else + elapsed = jiffies - cic->last_queue; +#else + elapsed = jiffies - cic->last_end_request; +#endif - if (!cfqq) { - if (new_cfqq) { - cfqq = new_cfqq; - new_cfqq = NULL; - } else { - spin_unlock_irq(cfqd->queue->queue_lock); - new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); - spin_lock_irq(cfqd->queue->queue_lock); + ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle); - if (!new_cfqq && !(gfp_mask & __GFP_WAIT)) - goto out; - - goto retry; - } + cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; + cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; + cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; +} - memset(cfqq, 0, sizeof(*cfqq)); +#define sample_valid(samples) ((samples) > 80) - INIT_HLIST_NODE(&cfqq->cfq_hash); - INIT_LIST_HEAD(&cfqq->cfq_list); - RB_CLEAR_ROOT(&cfqq->sort_list); - INIT_LIST_HEAD(&cfqq->fifo[0]); - INIT_LIST_HEAD(&cfqq->fifo[1]); +/* + * Disable idle window if the process thinks too long or seeks so much that + * it doesn't matter + */ +static void +cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct cfq_io_context *cic) +{ + int enable_idle = cfqq->idle_window; - cfqq->key = key; - hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); - atomic_set(&cfqq->ref, 0); - cfqq->cfqd = cfqd; - atomic_inc(&cfqd->ref); - cfqq->key_type = cfqd->key_type; - cfqq->service_start = ~0UL; + if (!cic->ioc->task || !cfqd->cfq_slice_idle) + enable_idle = 0; + else if (sample_valid(cic->ttime_samples)) { + if (cic->ttime_mean > cfqd->cfq_slice_idle) + enable_idle = 0; + else + enable_idle = 1; } - if (new_cfqq) - kmem_cache_free(cfq_pool, new_cfqq); + cfqq->idle_window = enable_idle; +} - atomic_inc(&cfqq->ref); -out: - WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq); - return cfqq; + +/* + * Check if new_cfqq should preempt the currently active queue. Return 0 for + * no or if we aren't sure, a 1 will cause a preempt. 
+ */ +static int +cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, + struct cfq_rq *crq) +{ + struct cfq_queue *cfqq = cfqd->active_queue; + + if (cfq_class_idle(new_cfqq)) + return 0; + + if (!cfqq) + return 1; + + if (cfq_class_idle(cfqq)) + return 1; + if (!new_cfqq->wait_request) + return 0; + /* + * if it doesn't have slice left, forget it + */ + if (new_cfqq->slice_left < cfqd->cfq_slice_idle) + return 0; + if (crq->is_sync && !cfq_cfqq_sync(cfqq)) + return 1; + + return 0; +} + +/* + * cfqq preempts the active queue. if we allowed preempt with no slice left, + * let it have half of its nominal slice. + */ +static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + struct cfq_queue *__cfqq, *next; + + list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list) + cfq_resort_rr_list(__cfqq, 1); + + if (!cfqq->slice_left) + cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2; + + cfqq->slice_end = cfqq->slice_left + jiffies; + cfq_slice_expired(cfqd, 1); + __cfq_set_active_queue(cfqd, cfqq); +} + +/* + * should really be a ll_rw_blk.c helper + */ +static void cfq_start_queueing(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + request_queue_t *q = cfqd->queue; + + if (!blk_queue_plugged(q)) + q->request_fn(q); + else + __generic_unplug_device(q); +} + +/* + * Called when a new fs request (crq) is added (to cfqq). Check if there's + * something we should do about it + */ +static void +cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct cfq_rq *crq) +{ + const int sync = crq->is_sync; + + cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq); + + if (sync) { + struct cfq_io_context *cic = crq->io_context; + + cfq_update_io_thinktime(cfqd, cic); + cfq_update_idle_window(cfqd, cfqq, cic); + + cic->last_queue = jiffies; + } + + if (cfqq == cfqd->active_queue) { + /* + * if we are waiting for a request for this queue, let it rip + * immediately and flag that we must not expire this queue + * just now + */ + if (cfqq->wait_request) { + cfqq->must_dispatch = 1; + del_timer(&cfqd->idle_slice_timer); + cfq_start_queueing(cfqd, cfqq); + } + } else if (cfq_should_preempt(cfqd, cfqq, crq)) { + /* + * not the active queue - expire current slice if it is + * idle and has expired it's mean thinktime or this new queue + * has some old slice time left and is of higher priority + */ + cfq_preempt_queue(cfqd, cfqq); + cfqq->must_dispatch = 1; + cfq_start_queueing(cfqd, cfqq); + } } -static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq) +static void cfq_enqueue(struct cfq_data *cfqd, struct request *rq) { - crq->is_sync = 0; - if (rq_data_dir(crq->request) == READ || current->flags & PF_SYNCWRITE) - crq->is_sync = 1; + struct cfq_rq *crq = RQ_DATA(rq); + struct cfq_queue *cfqq = crq->cfq_queue; + + cfq_init_prio_data(cfqq); cfq_add_crq_rb(crq); - crq->queue_start = jiffies; - list_add_tail(&crq->request->queuelist, &crq->cfq_queue->fifo[crq->is_sync]); + list_add_tail(&rq->queuelist, &cfqq->fifo); + + if (rq_mergeable(rq)) { + cfq_add_crq_hash(cfqd, crq); + + if (!cfqd->queue->last_merge) + cfqd->queue->last_merge = rq; + } + + cfq_crq_enqueued(cfqd, cfqq, crq); } static void cfq_insert_request(request_queue_t *q, struct request *rq, int where) { struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_rq *crq = RQ_DATA(rq); switch (where) { case ELEVATOR_INSERT_BACK: - while (cfq_dispatch_requests(q, cfqd->cfq_quantum)) + while (cfq_dispatch_requests(q, INT_MAX, 1)) ; list_add_tail(&rq->queuelist, 
&q->queue_head); + /* + * If we were idling with pending requests on + * inactive cfqqs, force dispatching will + * remove the idle timer and the queue won't + * be kicked by __make_request() afterward. + * Kick it here. + */ + kblockd_schedule_work(&cfqd->unplug_work); break; case ELEVATOR_INSERT_FRONT: list_add(&rq->queuelist, &q->queue_head); break; case ELEVATOR_INSERT_SORT: BUG_ON(!blk_fs_request(rq)); - cfq_enqueue(cfqd, crq); + cfq_enqueue(cfqd, rq); break; default: printk("%s: bad insert point %d\n", __FUNCTION__,where); return; } +} - if (rq_mergeable(rq)) { - cfq_add_crq_hash(cfqd, crq); - - if (!q->last_merge) - q->last_merge = rq; - } +static inline int cfq_pending_requests(struct cfq_data *cfqd) +{ + return !list_empty(&cfqd->queue->queue_head) || cfqd->busy_queues; } static int cfq_queue_empty(request_queue_t *q) { struct cfq_data *cfqd = q->elevator->elevator_data; - return list_empty(&q->queue_head) && list_empty(&cfqd->rr_list); + return !cfq_pending_requests(cfqd); } static void cfq_completed_request(request_queue_t *q, struct request *rq) @@ -1332,51 +1781,132 @@ cfq_latter_request(request_queue_t *q, struct request *rq) return NULL; } -static int cfq_may_queue(request_queue_t *q, int rw) +/* + * we temporarily boost lower priority queues if they are holding fs exclusive + * resources. they are boosted to normal prio (CLASS_BE/4) + */ +static void cfq_prio_boost(struct cfq_queue *cfqq) { - struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_queue *cfqq; - int ret = ELV_MQUEUE_MAY; + const int ioprio_class = cfqq->ioprio_class; + const int ioprio = cfqq->ioprio; - if (current->flags & PF_MEMALLOC) - return ELV_MQUEUE_MAY; + if (has_fs_excl()) { + /* + * boost idle prio on transactions that would lock out other + * users of the filesystem + */ + if (cfq_class_idle(cfqq)) + cfqq->ioprio_class = IOPRIO_CLASS_BE; + if (cfqq->ioprio > IOPRIO_NORM) + cfqq->ioprio = IOPRIO_NORM; + } else { + /* + * check if we need to unboost the queue + */ + if (cfqq->ioprio_class != cfqq->org_ioprio_class) + cfqq->ioprio_class = cfqq->org_ioprio_class; + if (cfqq->ioprio != cfqq->org_ioprio) + cfqq->ioprio = cfqq->org_ioprio; + } - cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(cfqd, current)); - if (cfqq) { - int limit = cfqd->max_queued; + /* + * refile between round-robin lists if we moved the priority class + */ + if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio) && + cfqq->on_rr) + cfq_resort_rr_list(cfqq, 0); +} - if (cfqq->allocated[rw] < cfqd->cfq_queued) - return ELV_MQUEUE_MUST; +static inline pid_t cfq_queue_pid(struct task_struct *task, int rw) +{ + if (rw == READ || process_sync(task)) + return task->pid; - if (cfqd->busy_queues) - limit = q->nr_requests / cfqd->busy_queues; + return CFQ_KEY_ASYNC; +} - if (limit < cfqd->cfq_queued) - limit = cfqd->cfq_queued; - else if (limit > cfqd->max_queued) - limit = cfqd->max_queued; +static inline int +__cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct task_struct *task, int rw) +{ + if (cfqq->wait_request && cfqq->must_alloc) + return ELV_MQUEUE_MUST; - if (cfqq->allocated[rw] >= limit) { - if (limit > cfqq->alloc_limit[rw]) - cfqq->alloc_limit[rw] = limit; + return ELV_MQUEUE_MAY; +#if 0 + if (!cfqq || task->flags & PF_MEMALLOC) + return ELV_MQUEUE_MAY; + if (!cfqq->allocated[rw] || cfqq->must_alloc) { + if (cfqq->wait_request) + return ELV_MQUEUE_MUST; - ret = ELV_MQUEUE_NO; + /* + * only allow 1 ELV_MQUEUE_MUST per slice, otherwise we + * can quickly flood the queue with writes from a 
single task + */ + if (rw == READ || !cfqq->must_alloc_slice) { + cfqq->must_alloc_slice = 1; + return ELV_MQUEUE_MUST; } + + return ELV_MQUEUE_MAY; } + if (cfq_class_idle(cfqq)) + return ELV_MQUEUE_NO; + if (cfqq->allocated[rw] >= cfqd->max_queued) { + struct io_context *ioc = get_io_context(GFP_ATOMIC); + int ret = ELV_MQUEUE_NO; - return ret; + if (ioc && ioc->nr_batch_requests) + ret = ELV_MQUEUE_MAY; + + put_io_context(ioc); + return ret; + } + + return ELV_MQUEUE_MAY; +#endif +} + +static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio) +{ + struct cfq_data *cfqd = q->elevator->elevator_data; + struct task_struct *tsk = current; + struct cfq_queue *cfqq; + + /* + * don't force setup of a queue from here, as a call to may_queue + * does not necessarily imply that a request actually will be queued. + * so just lookup a possibly existing queue, or return 'may queue' + * if that fails + */ + cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw)); + if (cfqq) { + cfq_init_prio_data(cfqq); + cfq_prio_boost(cfqq); + + return __cfq_may_queue(cfqd, cfqq, tsk, rw); + } + + return ELV_MQUEUE_MAY; } static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq) { + struct cfq_data *cfqd = q->elevator->elevator_data; struct request_list *rl = &q->rq; - const int write = waitqueue_active(&rl->wait[WRITE]); - const int read = waitqueue_active(&rl->wait[READ]); - if (read && cfqq->allocated[READ] < cfqq->alloc_limit[READ]) - wake_up(&rl->wait[READ]); - if (write && cfqq->allocated[WRITE] < cfqq->alloc_limit[WRITE]) - wake_up(&rl->wait[WRITE]); + if (cfqq->allocated[READ] <= cfqd->max_queued || cfqd->rq_starved) { + smp_mb(); + if (waitqueue_active(&rl->wait[READ])) + wake_up(&rl->wait[READ]); + } + + if (cfqq->allocated[WRITE] <= cfqd->max_queued || cfqd->rq_starved) { + smp_mb(); + if (waitqueue_active(&rl->wait[WRITE])) + wake_up(&rl->wait[WRITE]); + } } /* @@ -1389,69 +1919,59 @@ static void cfq_put_request(request_queue_t *q, struct request *rq) if (crq) { struct cfq_queue *cfqq = crq->cfq_queue; + const int rw = rq_data_dir(rq); - BUG_ON(q->last_merge == rq); - BUG_ON(!hlist_unhashed(&crq->hash)); - - if (crq->io_context) - put_io_context(crq->io_context->ioc); + BUG_ON(!cfqq->allocated[rw]); + cfqq->allocated[rw]--; - BUG_ON(!cfqq->allocated[crq->is_write]); - cfqq->allocated[crq->is_write]--; + put_io_context(crq->io_context->ioc); mempool_free(crq, cfqd->crq_pool); rq->elevator_private = NULL; - smp_mb(); cfq_check_waiters(q, cfqq); cfq_put_queue(cfqq); } } /* - * Allocate cfq data structures associated with this request. A queue and + * Allocate cfq data structures associated with this request. 
*/ -static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask) +static int +cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio, + int gfp_mask) { struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_io_context *cic; const int rw = rq_data_dir(rq); - struct cfq_queue *cfqq, *saved_cfqq; + struct cfq_queue *cfqq; struct cfq_rq *crq; unsigned long flags; might_sleep_if(gfp_mask & __GFP_WAIT); + cic = cfq_get_io_context(cfqd, cfq_queue_pid(current, rw), gfp_mask); + spin_lock_irqsave(q->queue_lock, flags); - cfqq = __cfq_get_queue(cfqd, cfq_hash_key(cfqd, current), gfp_mask); - if (!cfqq) - goto out_lock; + if (!cic) + goto queue_fail; + + if (!cic->cfqq) { + cfqq = cfq_get_queue(cfqd, current->pid, gfp_mask); + if (!cfqq) + goto queue_fail; -repeat: - if (cfqq->allocated[rw] >= cfqd->max_queued) - goto out_lock; + cic->cfqq = cfqq; + } else + cfqq = cic->cfqq; cfqq->allocated[rw]++; + cfqq->must_alloc = 0; + cfqd->rq_starved = 0; + atomic_inc(&cfqq->ref); spin_unlock_irqrestore(q->queue_lock, flags); - /* - * if hashing type has changed, the cfq_queue might change here. - */ - saved_cfqq = cfqq; - cic = cfq_get_io_context(&cfqq, gfp_mask); - if (!cic) - goto err; - - /* - * repeat allocation checks on queue change - */ - if (unlikely(saved_cfqq != cfqq)) { - spin_lock_irqsave(q->queue_lock, flags); - saved_cfqq->allocated[rw]--; - goto repeat; - } - crq = mempool_alloc(cfqd->crq_pool, gfp_mask); if (crq) { RB_CLEAR(&crq->rb_node); @@ -1460,24 +1980,130 @@ repeat: INIT_HLIST_NODE(&crq->hash); crq->cfq_queue = cfqq; crq->io_context = cic; - crq->service_start = crq->queue_start = 0; - crq->in_flight = crq->accounted = crq->is_sync = 0; - crq->is_write = rw; + crq->in_flight = crq->accounted = 0; + crq->is_sync = (rw == READ || process_sync(current)); + crq->requeued = 0; rq->elevator_private = crq; - cfqq->alloc_limit[rw] = 0; return 0; } - put_io_context(cic->ioc); -err: spin_lock_irqsave(q->queue_lock, flags); cfqq->allocated[rw]--; + if (!(cfqq->allocated[0] + cfqq->allocated[1])) + cfqq->must_alloc = 1; cfq_put_queue(cfqq); -out_lock: +queue_fail: + if (cic) + put_io_context(cic->ioc); + /* + * mark us rq allocation starved. we need to kickstart the process + * ourselves if there are no pending requests that can do it for us. 
+ * that would be an extremely rare OOM situation + */ + cfqd->rq_starved = 1; + kblockd_schedule_work(&cfqd->unplug_work); spin_unlock_irqrestore(q->queue_lock, flags); return 1; } +static void cfq_kick_queue(void *data) +{ + request_queue_t *q = data; + struct cfq_data *cfqd = q->elevator->elevator_data; + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + + if (cfqd->rq_starved) { + struct request_list *rl = &q->rq; + + /* + * we aren't guaranteed to get a request after this, but we + * have to be opportunistic + */ + smp_mb(); + if (waitqueue_active(&rl->wait[READ])) + wake_up(&rl->wait[READ]); + if (waitqueue_active(&rl->wait[WRITE])) + wake_up(&rl->wait[WRITE]); + } + + blk_remove_plug(q); + q->request_fn(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + +/* + * Timer running if the active_queue is currently idling inside its time slice + */ +static void cfq_idle_slice_timer(unsigned long data) +{ + struct cfq_data *cfqd = (struct cfq_data *) data; + struct cfq_queue *cfqq; + unsigned long flags; + + spin_lock_irqsave(cfqd->queue->queue_lock, flags); + + if ((cfqq = cfqd->active_queue) != NULL) { + unsigned long now = jiffies; + + /* + * expired + */ + if (time_after(now, cfqq->slice_end)) + goto expire; + + /* + * only expire and reinvoke request handler, if there are + * other queues with pending requests + */ + if (!cfq_pending_requests(cfqd)) { + cfqd->idle_slice_timer.expires = min(now + cfqd->cfq_slice_idle, cfqq->slice_end); + add_timer(&cfqd->idle_slice_timer); + goto out_cont; + } + + /* + * not expired and it has a request pending, let it dispatch + */ + if (!RB_EMPTY(&cfqq->sort_list)) { + cfqq->must_dispatch = 1; + goto out_kick; + } + } +expire: + cfq_slice_expired(cfqd, 0); +out_kick: + if (cfq_pending_requests(cfqd)) + kblockd_schedule_work(&cfqd->unplug_work); +out_cont: + spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); +} + +/* + * Timer running if an idle class queue is waiting for service + */ +static void cfq_idle_class_timer(unsigned long data) +{ + struct cfq_data *cfqd = (struct cfq_data *) data; + unsigned long flags, end; + + spin_lock_irqsave(cfqd->queue->queue_lock, flags); + + /* + * race with a non-idle queue, reset timer + */ + end = cfqd->last_end_request + CFQ_IDLE_GRACE; + if (!time_after_eq(jiffies, end)) { + cfqd->idle_class_timer.expires = end; + add_timer(&cfqd->idle_class_timer); + } else + kblockd_schedule_work(&cfqd->unplug_work); + + spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); +} + + static void cfq_put_cfqd(struct cfq_data *cfqd) { request_queue_t *q = cfqd->queue; @@ -1485,6 +2111,8 @@ static void cfq_put_cfqd(struct cfq_data *cfqd) if (!atomic_dec_and_test(&cfqd->ref)) return; + blk_sync_queue(q); + blk_put_queue(q); mempool_destroy(cfqd->crq_pool); @@ -1495,7 +2123,11 @@ static void cfq_put_cfqd(struct cfq_data *cfqd) static void cfq_exit_queue(elevator_t *e) { - cfq_put_cfqd(e->elevator_data); + struct cfq_data *cfqd = e->elevator_data; + + del_timer_sync(&cfqd->idle_slice_timer); + del_timer_sync(&cfqd->idle_class_timer); + cfq_put_cfqd(cfqd); } static int cfq_init_queue(request_queue_t *q, elevator_t *e) @@ -1508,7 +2140,13 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e) return -ENOMEM; memset(cfqd, 0, sizeof(*cfqd)); - INIT_LIST_HEAD(&cfqd->rr_list); + + for (i = 0; i < CFQ_PRIO_LISTS; i++) + INIT_LIST_HEAD(&cfqd->rr_list[i]); + + INIT_LIST_HEAD(&cfqd->busy_rr); + INIT_LIST_HEAD(&cfqd->cur_rr); + INIT_LIST_HEAD(&cfqd->idle_rr); INIT_LIST_HEAD(&cfqd->empty_list); cfqd->crq_hash 
= kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); @@ -1533,25 +2171,32 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e) cfqd->queue = q; atomic_inc(&q->refcnt); - /* - * just set it to some high value, we want anyone to be able to queue - * some requests. fairness is handled differently - */ - q->nr_requests = 1024; - cfqd->max_queued = q->nr_requests / 16; + cfqd->max_queued = q->nr_requests / 4; q->nr_batching = cfq_queued; - cfqd->key_type = CFQ_KEY_TGID; - cfqd->find_best_crq = 1; + + init_timer(&cfqd->idle_slice_timer); + cfqd->idle_slice_timer.function = cfq_idle_slice_timer; + cfqd->idle_slice_timer.data = (unsigned long) cfqd; + + init_timer(&cfqd->idle_class_timer); + cfqd->idle_class_timer.function = cfq_idle_class_timer; + cfqd->idle_class_timer.data = (unsigned long) cfqd; + + INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q); + atomic_set(&cfqd->ref, 1); cfqd->cfq_queued = cfq_queued; cfqd->cfq_quantum = cfq_quantum; - cfqd->cfq_fifo_expire_r = cfq_fifo_expire_r; - cfqd->cfq_fifo_expire_w = cfq_fifo_expire_w; - cfqd->cfq_fifo_batch_expire = cfq_fifo_rate; + cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; + cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; cfqd->cfq_back_max = cfq_back_max; cfqd->cfq_back_penalty = cfq_back_penalty; - + cfqd->cfq_slice[0] = cfq_slice_async; + cfqd->cfq_slice[1] = cfq_slice_sync; + cfqd->cfq_slice_async_rq = cfq_slice_async_rq; + cfqd->cfq_slice_idle = cfq_slice_idle; + cfqd->cfq_max_depth = cfq_max_depth; return 0; out_crqpool: kfree(cfqd->cfq_hash); @@ -1595,7 +2240,6 @@ fail: return -ENOMEM; } - /* * sysfs parts below --> */ @@ -1620,45 +2264,6 @@ cfq_var_store(unsigned int *var, const char *page, size_t count) return count; } -static ssize_t -cfq_clear_elapsed(struct cfq_data *cfqd, const char *page, size_t count) -{ - max_elapsed_dispatch = max_elapsed_crq = 0; - return count; -} - -static ssize_t -cfq_set_key_type(struct cfq_data *cfqd, const char *page, size_t count) -{ - spin_lock_irq(cfqd->queue->queue_lock); - if (!strncmp(page, "pgid", 4)) - cfqd->key_type = CFQ_KEY_PGID; - else if (!strncmp(page, "tgid", 4)) - cfqd->key_type = CFQ_KEY_TGID; - else if (!strncmp(page, "uid", 3)) - cfqd->key_type = CFQ_KEY_UID; - else if (!strncmp(page, "gid", 3)) - cfqd->key_type = CFQ_KEY_GID; - spin_unlock_irq(cfqd->queue->queue_lock); - return count; -} - -static ssize_t -cfq_read_key_type(struct cfq_data *cfqd, char *page) -{ - ssize_t len = 0; - int i; - - for (i = CFQ_KEY_PGID; i < CFQ_KEY_LAST; i++) { - if (cfqd->key_type == i) - len += sprintf(page+len, "[%s] ", cfq_key_types[i]); - else - len += sprintf(page+len, "%s ", cfq_key_types[i]); - } - len += sprintf(page+len, "\n"); - return len; -} - #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \ { \ @@ -1669,12 +2274,15 @@ static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \ } SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0); -SHOW_FUNCTION(cfq_fifo_expire_r_show, cfqd->cfq_fifo_expire_r, 1); -SHOW_FUNCTION(cfq_fifo_expire_w_show, cfqd->cfq_fifo_expire_w, 1); -SHOW_FUNCTION(cfq_fifo_batch_expire_show, cfqd->cfq_fifo_batch_expire, 1); -SHOW_FUNCTION(cfq_find_best_show, cfqd->find_best_crq, 0); +SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); +SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0); SHOW_FUNCTION(cfq_back_penalty_show, 
cfqd->cfq_back_penalty, 0); +SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); +SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); +SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); +SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); +SHOW_FUNCTION(cfq_max_depth_show, cfqd->cfq_max_depth, 0); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ @@ -1694,12 +2302,15 @@ static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count) \ } STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0); -STORE_FUNCTION(cfq_fifo_expire_r_store, &cfqd->cfq_fifo_expire_r, 1, UINT_MAX, 1); -STORE_FUNCTION(cfq_fifo_expire_w_store, &cfqd->cfq_fifo_expire_w, 1, UINT_MAX, 1); -STORE_FUNCTION(cfq_fifo_batch_expire_store, &cfqd->cfq_fifo_batch_expire, 0, UINT_MAX, 1); -STORE_FUNCTION(cfq_find_best_store, &cfqd->find_best_crq, 0, 1, 0); +STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1); +STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0); +STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); +STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); +STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); +STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); +STORE_FUNCTION(cfq_max_depth_store, &cfqd->cfq_max_depth, 1, UINT_MAX, 0); #undef STORE_FUNCTION static struct cfq_fs_entry cfq_quantum_entry = { @@ -1712,25 +2323,15 @@ static struct cfq_fs_entry cfq_queued_entry = { .show = cfq_queued_show, .store = cfq_queued_store, }; -static struct cfq_fs_entry cfq_fifo_expire_r_entry = { +static struct cfq_fs_entry cfq_fifo_expire_sync_entry = { .attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_fifo_expire_r_show, - .store = cfq_fifo_expire_r_store, + .show = cfq_fifo_expire_sync_show, + .store = cfq_fifo_expire_sync_store, }; -static struct cfq_fs_entry cfq_fifo_expire_w_entry = { +static struct cfq_fs_entry cfq_fifo_expire_async_entry = { .attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_fifo_expire_w_show, - .store = cfq_fifo_expire_w_store, -}; -static struct cfq_fs_entry cfq_fifo_batch_expire_entry = { - .attr = {.name = "fifo_batch_expire", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_fifo_batch_expire_show, - .store = cfq_fifo_batch_expire_store, -}; -static struct cfq_fs_entry cfq_find_best_entry = { - .attr = {.name = "find_best_crq", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_find_best_show, - .store = cfq_find_best_store, + .show = cfq_fifo_expire_async_show, + .store = cfq_fifo_expire_async_store, }; static struct cfq_fs_entry cfq_back_max_entry = { .attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR }, @@ -1742,27 +2343,43 @@ static struct cfq_fs_entry cfq_back_penalty_entry = { .show = cfq_back_penalty_show, .store = cfq_back_penalty_store, }; -static struct cfq_fs_entry cfq_clear_elapsed_entry = { - .attr = {.name = "clear_elapsed", .mode = S_IWUSR }, - .store = cfq_clear_elapsed, +static struct cfq_fs_entry cfq_slice_sync_entry = { + .attr = {.name = "slice_sync", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_slice_sync_show, + .store = cfq_slice_sync_store, }; 
-static struct cfq_fs_entry cfq_key_type_entry = { - .attr = {.name = "key_type", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_read_key_type, - .store = cfq_set_key_type, +static struct cfq_fs_entry cfq_slice_async_entry = { + .attr = {.name = "slice_async", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_slice_async_show, + .store = cfq_slice_async_store, +}; +static struct cfq_fs_entry cfq_slice_async_rq_entry = { + .attr = {.name = "slice_async_rq", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_slice_async_rq_show, + .store = cfq_slice_async_rq_store, +}; +static struct cfq_fs_entry cfq_slice_idle_entry = { + .attr = {.name = "slice_idle", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_slice_idle_show, + .store = cfq_slice_idle_store, +}; +static struct cfq_fs_entry cfq_max_depth_entry = { + .attr = {.name = "max_depth", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_max_depth_show, + .store = cfq_max_depth_store, }; - static struct attribute *default_attrs[] = { &cfq_quantum_entry.attr, &cfq_queued_entry.attr, - &cfq_fifo_expire_r_entry.attr, - &cfq_fifo_expire_w_entry.attr, - &cfq_fifo_batch_expire_entry.attr, - &cfq_key_type_entry.attr, - &cfq_find_best_entry.attr, + &cfq_fifo_expire_sync_entry.attr, + &cfq_fifo_expire_async_entry.attr, &cfq_back_max_entry.attr, &cfq_back_penalty_entry.attr, - &cfq_clear_elapsed_entry.attr, + &cfq_slice_sync_entry.attr, + &cfq_slice_async_entry.attr, + &cfq_slice_async_rq_entry.attr, + &cfq_slice_idle_entry.attr, + &cfq_max_depth_entry.attr, NULL, }; @@ -1832,21 +2449,46 @@ static int __init cfq_init(void) { int ret; + /* + * could be 0 on HZ < 1000 setups + */ + if (!cfq_slice_async) + cfq_slice_async = 1; + if (!cfq_slice_idle) + cfq_slice_idle = 1; + if (cfq_slab_setup()) return -ENOMEM; ret = elv_register(&iosched_cfq); - if (!ret) { - __module_get(THIS_MODULE); - return 0; - } + if (ret) + cfq_slab_kill(); - cfq_slab_kill(); return ret; } static void __exit cfq_exit(void) { + struct task_struct *g, *p; + unsigned long flags; + + read_lock_irqsave(&tasklist_lock, flags); + + /* + * iterate each process in the system, removing our io_context + */ + do_each_thread(g, p) { + struct io_context *ioc = p->io_context; + + if (ioc && ioc->cic) { + ioc->cic->exit(ioc->cic); + cfq_free_io_context(ioc->cic); + ioc->cic = NULL; + } + } while_each_thread(g, p); + + read_unlock_irqrestore(&tasklist_lock, flags); + cfq_slab_kill(); elv_unregister(&iosched_cfq); } diff --git a/drivers/block/deadline-iosched.c b/drivers/block/deadline-iosched.c index 4bc2fea7327..ff5201e0215 100644 --- a/drivers/block/deadline-iosched.c +++ b/drivers/block/deadline-iosched.c @@ -760,7 +760,8 @@ static void deadline_put_request(request_queue_t *q, struct request *rq) } static int -deadline_set_request(request_queue_t *q, struct request *rq, int gfp_mask) +deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio, + int gfp_mask) { struct deadline_data *dd = q->elevator->elevator_data; struct deadline_rq *drq; diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c index f831f08f839..98f0126a2de 100644 --- a/drivers/block/elevator.c +++ b/drivers/block/elevator.c @@ -486,12 +486,13 @@ struct request *elv_former_request(request_queue_t *q, struct request *rq) return NULL; } -int elv_set_request(request_queue_t *q, struct request *rq, int gfp_mask) +int elv_set_request(request_queue_t *q, struct request *rq, struct bio *bio, + int gfp_mask) { elevator_t *e = q->elevator; if (e->ops->elevator_set_req_fn) - return e->ops->elevator_set_req_fn(q, rq, gfp_mask); + return 
e->ops->elevator_set_req_fn(q, rq, bio, gfp_mask); rq->elevator_private = NULL; return 0; @@ -505,12 +506,12 @@ void elv_put_request(request_queue_t *q, struct request *rq) e->ops->elevator_put_req_fn(q, rq); } -int elv_may_queue(request_queue_t *q, int rw) +int elv_may_queue(request_queue_t *q, int rw, struct bio *bio) { elevator_t *e = q->elevator; if (e->ops->elevator_may_queue_fn) - return e->ops->elevator_may_queue_fn(q, rw); + return e->ops->elevator_may_queue_fn(q, rw, bio); return ELV_MQUEUE_MAY; } diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 60e64091de1..234fdcfbdf0 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -276,6 +276,7 @@ static inline void rq_init(request_queue_t *q, struct request *rq) rq->errors = 0; rq->rq_status = RQ_ACTIVE; rq->bio = rq->biotail = NULL; + rq->ioprio = 0; rq->buffer = NULL; rq->ref_count = 1; rq->q = q; @@ -1442,11 +1443,7 @@ void __generic_unplug_device(request_queue_t *q) if (!blk_remove_plug(q)) return; - /* - * was plugged, fire request_fn if queue has stuff to do - */ - if (elv_next_request(q)) - q->request_fn(q); + q->request_fn(q); } EXPORT_SYMBOL(__generic_unplug_device); @@ -1776,8 +1773,8 @@ static inline void blk_free_request(request_queue_t *q, struct request *rq) mempool_free(rq, q->rq.rq_pool); } -static inline struct request *blk_alloc_request(request_queue_t *q, int rw, - int gfp_mask) +static inline struct request * +blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask) { struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); @@ -1790,7 +1787,7 @@ static inline struct request *blk_alloc_request(request_queue_t *q, int rw, */ rq->flags = rw; - if (!elv_set_request(q, rq, gfp_mask)) + if (!elv_set_request(q, rq, bio, gfp_mask)) return rq; mempool_free(rq, q->rq.rq_pool); @@ -1872,7 +1869,8 @@ static void freed_request(request_queue_t *q, int rw) /* * Get a free request, queue_lock must not be held */ -static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) +static struct request *get_request(request_queue_t *q, int rw, struct bio *bio, + int gfp_mask) { struct request *rq = NULL; struct request_list *rl = &q->rq; @@ -1895,7 +1893,7 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) } } - switch (elv_may_queue(q, rw)) { + switch (elv_may_queue(q, rw, bio)) { case ELV_MQUEUE_NO: goto rq_starved; case ELV_MQUEUE_MAY: @@ -1920,7 +1918,7 @@ get_rq: set_queue_congested(q, rw); spin_unlock_irq(q->queue_lock); - rq = blk_alloc_request(q, rw, gfp_mask); + rq = blk_alloc_request(q, rw, bio, gfp_mask); if (!rq) { /* * Allocation failed presumably due to memory. Undo anything @@ -1961,7 +1959,8 @@ out: * No available requests for this queue, unplug the device and wait for some * requests to become available. 
*/ -static struct request *get_request_wait(request_queue_t *q, int rw) +static struct request *get_request_wait(request_queue_t *q, int rw, + struct bio *bio) { DEFINE_WAIT(wait); struct request *rq; @@ -1972,7 +1971,7 @@ static struct request *get_request_wait(request_queue_t *q, int rw) prepare_to_wait_exclusive(&rl->wait[rw], &wait, TASK_UNINTERRUPTIBLE); - rq = get_request(q, rw, GFP_NOIO); + rq = get_request(q, rw, bio, GFP_NOIO); if (!rq) { struct io_context *ioc; @@ -2003,9 +2002,9 @@ struct request *blk_get_request(request_queue_t *q, int rw, int gfp_mask) BUG_ON(rw != READ && rw != WRITE); if (gfp_mask & __GFP_WAIT) - rq = get_request_wait(q, rw); + rq = get_request_wait(q, rw, NULL); else - rq = get_request(q, rw, gfp_mask); + rq = get_request(q, rw, NULL, gfp_mask); return rq; } @@ -2333,7 +2332,6 @@ static void __blk_put_request(request_queue_t *q, struct request *req) return; req->rq_status = RQ_INACTIVE; - req->q = NULL; req->rl = NULL; /* @@ -2462,6 +2460,8 @@ static int attempt_merge(request_queue_t *q, struct request *req, req->rq_disk->in_flight--; } + req->ioprio = ioprio_best(req->ioprio, next->ioprio); + __blk_put_request(q, next); return 1; } @@ -2514,11 +2514,13 @@ static int __make_request(request_queue_t *q, struct bio *bio) { struct request *req, *freereq = NULL; int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync; + unsigned short prio; sector_t sector; sector = bio->bi_sector; nr_sectors = bio_sectors(bio); cur_nr_sectors = bio_cur_sectors(bio); + prio = bio_prio(bio); rw = bio_data_dir(bio); sync = bio_sync(bio); @@ -2559,6 +2561,7 @@ again: req->biotail->bi_next = bio; req->biotail = bio; req->nr_sectors = req->hard_nr_sectors += nr_sectors; + req->ioprio = ioprio_best(req->ioprio, prio); drive_stat_acct(req, nr_sectors, 0); if (!attempt_back_merge(q, req)) elv_merged_request(q, req); @@ -2583,6 +2586,7 @@ again: req->hard_cur_sectors = cur_nr_sectors; req->sector = req->hard_sector = sector; req->nr_sectors = req->hard_nr_sectors += nr_sectors; + req->ioprio = ioprio_best(req->ioprio, prio); drive_stat_acct(req, nr_sectors, 0); if (!attempt_front_merge(q, req)) elv_merged_request(q, req); @@ -2610,7 +2614,7 @@ get_rq: freereq = NULL; } else { spin_unlock_irq(q->queue_lock); - if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) { + if ((freereq = get_request(q, rw, bio, GFP_ATOMIC)) == NULL) { /* * READA bit set */ @@ -2618,7 +2622,7 @@ get_rq: if (bio_rw_ahead(bio)) goto end_io; - freereq = get_request_wait(q, rw); + freereq = get_request_wait(q, rw, bio); } goto again; } @@ -2646,6 +2650,7 @@ get_rq: req->buffer = bio_data(bio); /* see ->buffer comment above */ req->waiting = NULL; req->bio = req->biotail = bio; + req->ioprio = prio; req->rq_disk = bio->bi_bdev->bd_disk; req->start_time = jiffies; @@ -2674,7 +2679,7 @@ static inline void blk_partition_remap(struct bio *bio) if (bdev != bdev->bd_contains) { struct hd_struct *p = bdev->bd_part; - switch (bio->bi_rw) { + switch (bio_data_dir(bio)) { case READ: p->read_sectors += bio_sectors(bio); p->reads++; @@ -2693,6 +2698,7 @@ void blk_finish_queue_drain(request_queue_t *q) { struct request_list *rl = &q->rq; struct request *rq; + int requeued = 0; spin_lock_irq(q->queue_lock); clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags); @@ -2701,9 +2707,13 @@ void blk_finish_queue_drain(request_queue_t *q) rq = list_entry_rq(q->drain_list.next); list_del_init(&rq->queuelist); - __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1); + elv_requeue_request(q, rq); + requeued++; } + if (requeued) + 
q->request_fn(q); + spin_unlock_irq(q->queue_lock); wake_up(&rl->wait[0]); @@ -2900,7 +2910,7 @@ void submit_bio(int rw, struct bio *bio) BIO_BUG_ON(!bio->bi_size); BIO_BUG_ON(!bio->bi_io_vec); - bio->bi_rw = rw; + bio->bi_rw |= rw; if (rw & WRITE) mod_page_state(pgpgout, count); else @@ -3257,8 +3267,11 @@ void exit_io_context(void) struct io_context *ioc; local_irq_save(flags); + task_lock(current); ioc = current->io_context; current->io_context = NULL; + ioc->task = NULL; + task_unlock(current); local_irq_restore(flags); if (ioc->aic && ioc->aic->exit) @@ -3293,12 +3306,12 @@ struct io_context *get_io_context(int gfp_flags) ret = kmem_cache_alloc(iocontext_cachep, gfp_flags); if (ret) { atomic_set(&ret->refcount, 1); - ret->pid = tsk->pid; + ret->task = current; + ret->set_ioprio = NULL; ret->last_waited = jiffies; /* doesn't matter... */ ret->nr_batch_requests = 0; /* because this is 0 */ ret->aic = NULL; ret->cic = NULL; - spin_lock_init(&ret->lock); local_irq_save(flags); -- cgit v1.2.3 From 3d25f3566bb606720a67caef77b16190df10dd98 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 27 Jun 2005 10:55:49 +0200 Subject: [PATCH] Fix cfq_find_next_crq() In cfq_find_next_crq(), cfq tries to find the next request by choosing one of two requests before and after the current one. Currently, when choosing the next request, if there's no next request, the next candidate is NULL, resulting in selection of the previous request. This results in weird scheduling. Once we reach the end, we always seek backward. The correct behavior is using the first request as the next candidate. cfq_choose_req() already has logics for handling wrapped requests. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- drivers/block/cfq-iosched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c index 35f6e569d5e..baa3e268250 100644 --- a/drivers/block/cfq-iosched.c +++ b/drivers/block/cfq-iosched.c @@ -375,9 +375,10 @@ cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_rq *crq_next = NULL, *crq_prev = NULL; struct rb_node *rbnext, *rbprev; + rbnext = NULL; if (ON_RB(&last->rb_node)) rbnext = rb_next(&last->rb_node); - else { + if (!rbnext) { rbnext = rb_first(&cfqq->sort_list); if (rbnext == &last->rb_node) rbnext = NULL; -- cgit v1.2.3 From 3b18152c327707ae6a2eeba4cfb66457143753bc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 27 Jun 2005 10:56:24 +0200 Subject: [PATCH] CFQ io scheduler updates - Adjust slice values - Instead of one async queue, one is defined per priority level. This prevents kernel threads (such as reiserfs/x and others) that run at higher io priority from conflicting with others. Previously, it was a coin toss what io prio the async queue got, it was defined by who first set up the queue. - Let a time slice only begin, when the previous slice is completely done. Previously we could be somewhat unfair to a new sync slice, if the previous slice was async and had several ios queued. This might need a little tweaking if throughput suffers a little due to this, allowing perhaps an overlap of a single request or so. - Optimize the calling of kblockd_schedule_work() by doing it only when it is strictly necessary (no requests in driver and work left to do). - Correct sync vs async logic. A 'normal' process can be purely async as well, and a flusher can be purely sync as well. 
Sync or async is now a property of the class defined and requests pending. Previously writers could be considered sync, when they were really async. - Get rid of the bit fields in cfqq and crq, use flags instead. - Various other cleanups and fixes Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- drivers/block/cfq-iosched.c | 460 ++++++++++++++++++++++++++++---------------- 1 file changed, 299 insertions(+), 161 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c index baa3e268250..1ecb179b860 100644 --- a/drivers/block/cfq-iosched.c +++ b/drivers/block/cfq-iosched.c @@ -34,14 +34,15 @@ static int cfq_back_max = 16 * 1024; /* maximum backwards seek, in KiB */ static int cfq_back_penalty = 2; /* penalty of a backwards seek */ static int cfq_slice_sync = HZ / 10; -static int cfq_slice_async = HZ / 50; +static int cfq_slice_async = HZ / 25; static int cfq_slice_async_rq = 2; -static int cfq_slice_idle = HZ / 50; +static int cfq_slice_idle = HZ / 100; #define CFQ_IDLE_GRACE (HZ / 10) #define CFQ_SLICE_SCALE (5) #define CFQ_KEY_ASYNC (0) +#define CFQ_KEY_ANY (0xffff) /* * disable queueing at the driver/hardware level @@ -96,7 +97,16 @@ static kmem_cache_t *cfq_ioc_pool; #define cfq_class_be(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_BE) #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) -#define cfq_cfqq_sync(cfqq) ((cfqq)->key != CFQ_KEY_ASYNC) +#define ASYNC (0) +#define SYNC (1) + +#define cfq_cfqq_dispatched(cfqq) \ + ((cfqq)->on_dispatch[ASYNC] + (cfqq)->on_dispatch[SYNC]) + +#define cfq_cfqq_class_sync(cfqq) ((cfqq)->key != CFQ_KEY_ASYNC) + +#define cfq_cfqq_sync(cfqq) \ + (cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC]) /* * Per block device queue structure @@ -200,28 +210,15 @@ struct cfq_queue { unsigned long slice_left; unsigned long service_last; - /* number of requests that have been handed to the driver */ - int in_flight; + /* number of requests that are on the dispatch list */ + int on_dispatch[2]; /* io prio of this group */ unsigned short ioprio, org_ioprio; unsigned short ioprio_class, org_ioprio_class; - /* whether queue is on rr (or empty) list */ - unsigned on_rr : 1; - /* idle slice, waiting for new request submission */ - unsigned wait_request : 1; - /* set when wait_request gets set, reset on first rq alloc */ - unsigned must_alloc : 1; - /* only gets one must_alloc per slice */ - unsigned must_alloc_slice : 1; - /* idle slice, request added, now waiting to dispatch it */ - unsigned must_dispatch : 1; - /* fifo expire per-slice */ - unsigned fifo_expire : 1; - - unsigned idle_window : 1; - unsigned prio_changed : 1; + /* various state flags, see below */ + unsigned int flags; }; struct cfq_rq { @@ -233,15 +230,77 @@ struct cfq_rq { struct cfq_queue *cfq_queue; struct cfq_io_context *io_context; - unsigned in_flight : 1; - unsigned accounted : 1; - unsigned is_sync : 1; - unsigned requeued : 1; + unsigned int crq_flags; }; -static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int); +enum cfqq_state_flags { + CFQ_CFQQ_FLAG_on_rr = 0, + CFQ_CFQQ_FLAG_wait_request, + CFQ_CFQQ_FLAG_must_alloc, + CFQ_CFQQ_FLAG_must_alloc_slice, + CFQ_CFQQ_FLAG_must_dispatch, + CFQ_CFQQ_FLAG_fifo_expire, + CFQ_CFQQ_FLAG_idle_window, + CFQ_CFQQ_FLAG_prio_changed, + CFQ_CFQQ_FLAG_expired, +}; + +#define CFQ_CFQQ_FNS(name) \ +static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \ +{ \ + cfqq->flags |= (1 << CFQ_CFQQ_FLAG_##name); \ +} \ +static inline void 
cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \ +{ \ + cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \ +} \ +static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ +{ \ + return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ +} + +CFQ_CFQQ_FNS(on_rr); +CFQ_CFQQ_FNS(wait_request); +CFQ_CFQQ_FNS(must_alloc); +CFQ_CFQQ_FNS(must_alloc_slice); +CFQ_CFQQ_FNS(must_dispatch); +CFQ_CFQQ_FNS(fifo_expire); +CFQ_CFQQ_FNS(idle_window); +CFQ_CFQQ_FNS(prio_changed); +CFQ_CFQQ_FNS(expired); +#undef CFQ_CFQQ_FNS + +enum cfq_rq_state_flags { + CFQ_CRQ_FLAG_in_flight = 0, + CFQ_CRQ_FLAG_in_driver, + CFQ_CRQ_FLAG_is_sync, + CFQ_CRQ_FLAG_requeued, +}; + +#define CFQ_CRQ_FNS(name) \ +static inline void cfq_mark_crq_##name(struct cfq_rq *crq) \ +{ \ + crq->crq_flags |= (1 << CFQ_CRQ_FLAG_##name); \ +} \ +static inline void cfq_clear_crq_##name(struct cfq_rq *crq) \ +{ \ + crq->crq_flags &= ~(1 << CFQ_CRQ_FLAG_##name); \ +} \ +static inline int cfq_crq_##name(const struct cfq_rq *crq) \ +{ \ + return (crq->crq_flags & (1 << CFQ_CRQ_FLAG_##name)) != 0; \ +} + +CFQ_CRQ_FNS(in_flight); +CFQ_CRQ_FNS(in_driver); +CFQ_CRQ_FNS(is_sync); +CFQ_CRQ_FNS(requeued); +#undef CFQ_CRQ_FNS + +static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short); static void cfq_dispatch_sort(request_queue_t *, struct cfq_rq *); static void cfq_put_cfqd(struct cfq_data *cfqd); +static inline int cfq_pending_requests(struct cfq_data *cfqd); #define process_sync(tsk) ((tsk)->flags & PF_SYNCWRITE) @@ -305,9 +364,9 @@ cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2) return crq2; if (crq2 == NULL) return crq1; - if (crq1->requeued) + if (cfq_crq_requeued(crq1)) return crq1; - if (crq2->requeued) + if (cfq_crq_requeued(crq2)) return crq2; s1 = crq1->request->sector; @@ -407,7 +466,7 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) struct cfq_data *cfqd = cfqq->cfqd; struct list_head *list, *entry; - BUG_ON(!cfqq->on_rr); + BUG_ON(!cfq_cfqq_on_rr(cfqq)); list_del(&cfqq->cfq_list); @@ -423,7 +482,7 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) * has lots of io pending vs one that only generates one * sporadically or synchronously */ - if (cfqq->in_flight) + if (cfq_cfqq_dispatched(cfqq)) list = &cfqd->busy_rr; else list = &cfqd->rr_list[cfqq->ioprio]; @@ -461,8 +520,8 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) static inline void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq, int requeue) { - BUG_ON(cfqq->on_rr); - cfqq->on_rr = 1; + BUG_ON(cfq_cfqq_on_rr(cfqq)); + cfq_mark_cfqq_on_rr(cfqq); cfqd->busy_queues++; cfq_resort_rr_list(cfqq, requeue); @@ -471,8 +530,8 @@ cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq, int requeue) static inline void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - BUG_ON(!cfqq->on_rr); - cfqq->on_rr = 0; + BUG_ON(!cfq_cfqq_on_rr(cfqq)); + cfq_clear_cfqq_on_rr(cfqq); list_move(&cfqq->cfq_list, &cfqd->empty_list); BUG_ON(!cfqd->busy_queues); @@ -488,7 +547,7 @@ static inline void cfq_del_crq_rb(struct cfq_rq *crq) if (ON_RB(&crq->rb_node)) { struct cfq_data *cfqd = cfqq->cfqd; - const int sync = crq->is_sync; + const int sync = cfq_crq_is_sync(crq); BUG_ON(!cfqq->queued[sync]); cfqq->queued[sync]--; @@ -498,7 +557,7 @@ static inline void cfq_del_crq_rb(struct cfq_rq *crq) rb_erase(&crq->rb_node, &cfqq->sort_list); RB_CLEAR_COLOR(&crq->rb_node); - if (cfqq->on_rr && RB_EMPTY(&cfqq->sort_list)) + if (cfq_cfqq_on_rr(cfqq) 
&& RB_EMPTY(&cfqq->sort_list)) cfq_del_cfqq_rr(cfqd, cfqq); } } @@ -534,7 +593,7 @@ static void cfq_add_crq_rb(struct cfq_rq *crq) struct cfq_rq *__alias; crq->rb_key = rq_rb_key(rq); - cfqq->queued[crq->is_sync]++; + cfqq->queued[cfq_crq_is_sync(crq)]++; /* * looks a little odd, but the first insert might return an alias. @@ -545,8 +604,8 @@ static void cfq_add_crq_rb(struct cfq_rq *crq) rb_insert_color(&crq->rb_node, &cfqq->sort_list); - if (!cfqq->on_rr) - cfq_add_cfqq_rr(cfqd, cfqq, crq->requeued); + if (!cfq_cfqq_on_rr(cfqq)) + cfq_add_cfqq_rr(cfqd, cfqq, cfq_crq_requeued(crq)); /* * check if this request is a better next-serve candidate @@ -559,7 +618,7 @@ cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) { if (ON_RB(&crq->rb_node)) { rb_erase(&crq->rb_node, &cfqq->sort_list); - cfqq->queued[crq->is_sync]--; + cfqq->queued[cfq_crq_is_sync(crq)]--; } cfq_add_crq_rb(crq); @@ -568,7 +627,7 @@ cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) static struct request *cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector) { - struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid); + struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid, CFQ_KEY_ANY); struct rb_node *n; if (!cfqq) @@ -598,17 +657,19 @@ static void cfq_deactivate_request(request_queue_t *q, struct request *rq) if (crq) { struct cfq_queue *cfqq = crq->cfq_queue; - if (crq->accounted) { - crq->accounted = 0; + if (cfq_crq_in_driver(crq)) { + cfq_clear_crq_in_driver(crq); WARN_ON(!cfqd->rq_in_driver); cfqd->rq_in_driver--; } - if (crq->in_flight) { - crq->in_flight = 0; - WARN_ON(!cfqq->in_flight); - cfqq->in_flight--; + if (cfq_crq_in_flight(crq)) { + const int sync = cfq_crq_is_sync(crq); + + cfq_clear_crq_in_flight(crq); + WARN_ON(!cfqq->on_dispatch[sync]); + cfqq->on_dispatch[sync]--; } - crq->requeued = 1; + cfq_mark_crq_requeued(crq); } } @@ -712,8 +773,9 @@ __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfqq->slice_start = jiffies; cfqq->slice_end = 0; cfqq->slice_left = 0; - cfqq->must_alloc_slice = 0; - cfqq->fifo_expire = 0; + cfq_clear_cfqq_must_alloc_slice(cfqq); + cfq_clear_cfqq_fifo_expire(cfqq); + cfq_clear_cfqq_expired(cfqq); } cfqd->active_queue = cfqq; @@ -776,9 +838,18 @@ static int cfq_get_next_prio_level(struct cfq_data *cfqd) return prio; } -static void cfq_set_active_queue(struct cfq_data *cfqd) +static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) { - struct cfq_queue *cfqq = NULL; + struct cfq_queue *cfqq; + + /* + * if current queue is expired but not done with its requests yet, + * wait for that to happen + */ + if ((cfqq = cfqd->active_queue) != NULL) { + if (cfq_cfqq_expired(cfqq) && cfq_cfqq_dispatched(cfqq)) + return NULL; + } /* * if current list is non-empty, grab first entry. 
if it is empty, @@ -802,50 +873,66 @@ static void cfq_set_active_queue(struct cfq_data *cfqd) } __cfq_set_active_queue(cfqd, cfqq); + return cfqq; } /* * current cfqq expired its slice (or was too idle), select new one */ -static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted) +static void +__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, + int preempted) { - struct cfq_queue *cfqq = cfqd->active_queue; - - if (cfqq) { - unsigned long now = jiffies; + unsigned long now = jiffies; - if (cfqq->wait_request) - del_timer(&cfqd->idle_slice_timer); + if (cfq_cfqq_wait_request(cfqq)) + del_timer(&cfqd->idle_slice_timer); - if (!preempted && !cfqq->in_flight) - cfqq->service_last = now; + if (!preempted && !cfq_cfqq_dispatched(cfqq)) + cfqq->service_last = now; - cfqq->must_dispatch = 0; - cfqq->wait_request = 0; + cfq_clear_cfqq_must_dispatch(cfqq); + cfq_clear_cfqq_wait_request(cfqq); - /* - * store what was left of this slice, if the queue idled out - * or was preempted - */ - if (time_after(now, cfqq->slice_end)) - cfqq->slice_left = now - cfqq->slice_end; - else - cfqq->slice_left = 0; + /* + * store what was left of this slice, if the queue idled out + * or was preempted + */ + if (time_after(now, cfqq->slice_end)) + cfqq->slice_left = now - cfqq->slice_end; + else + cfqq->slice_left = 0; - if (cfqq->on_rr) - cfq_resort_rr_list(cfqq, preempted); + if (cfq_cfqq_on_rr(cfqq)) + cfq_resort_rr_list(cfqq, preempted); + if (cfqq == cfqd->active_queue) cfqd->active_queue = NULL; - if (cfqd->active_cic) { - put_io_context(cfqd->active_cic->ioc); - cfqd->active_cic = NULL; - } + if (cfqd->active_cic) { + put_io_context(cfqd->active_cic->ioc); + cfqd->active_cic = NULL; } cfqd->dispatch_slice = 0; } +static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted) +{ + struct cfq_queue *cfqq = cfqd->active_queue; + + if (cfqq) { + /* + * use deferred expiry, if there are requests in progress as + * not to disturb the slice of the next queue + */ + if (cfq_cfqq_dispatched(cfqq)) + cfq_mark_cfqq_expired(cfqq); + else + __cfq_slice_expired(cfqd, cfqq, preempted); + } +} + static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) { @@ -857,7 +944,7 @@ static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) */ if (!cfqd->cfq_slice_idle) return 0; - if (!cfqq->idle_window) + if (!cfq_cfqq_idle_window(cfqq)) return 0; /* * task has exited, don't wait @@ -865,13 +952,13 @@ static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) if (cfqd->active_cic && !cfqd->active_cic->ioc->task) return 0; - cfqq->wait_request = 1; - cfqq->must_alloc = 1; + cfq_mark_cfqq_must_dispatch(cfqq); + cfq_mark_cfqq_wait_request(cfqq); if (!timer_pending(&cfqd->idle_slice_timer)) { - unsigned long slice_left = cfqq->slice_end - 1; + unsigned long slice_left = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle); - cfqd->idle_slice_timer.expires = min(jiffies + cfqd->cfq_slice_idle, slice_left); + cfqd->idle_slice_timer.expires = jiffies + slice_left; add_timer(&cfqd->idle_slice_timer); } @@ -901,7 +988,7 @@ static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq) break; if (!blk_fs_request(__rq)) break; - if (__crq->requeued) + if (cfq_crq_requeued(__crq)) break; if (__rq->sector <= crq->request->sector) @@ -920,9 +1007,10 @@ static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq) cfq_del_crq_rb(crq); cfq_remove_merge_hints(q, crq); - crq->in_flight = 1; - crq->requeued = 0; - 
cfqq->in_flight++; + cfq_mark_crq_in_flight(crq); + cfq_clear_crq_requeued(crq); + + cfqq->on_dispatch[cfq_crq_is_sync(crq)]++; list_add_tail(&crq->request->queuelist, entry); } @@ -935,16 +1023,16 @@ static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq) struct request *rq; struct cfq_rq *crq; - if (cfqq->fifo_expire) + if (cfq_cfqq_fifo_expire(cfqq)) return NULL; if (!list_empty(&cfqq->fifo)) { - int fifo = cfq_cfqq_sync(cfqq); + int fifo = cfq_cfqq_class_sync(cfqq); crq = RQ_DATA(list_entry_fifo(cfqq->fifo.next)); rq = crq->request; if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) { - cfqq->fifo_expire = 1; + cfq_mark_cfqq_fifo_expire(cfqq); return crq; } } @@ -953,7 +1041,9 @@ static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq) } /* - * Scale schedule slice based on io priority + * Scale schedule slice based on io priority. Use the sync time slice only + * if a queue is marked sync and has sync io queued. A sync queue with async + * io only, should not get full sync slice length. */ static inline int cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) @@ -981,6 +1071,16 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); } +/* + * scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing + */ +static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) +{ + if (!cfqd->rq_in_driver && cfq_pending_requests(cfqd)) + kblockd_schedule_work(&cfqd->unplug_work); +} + /* * get next queue for service */ @@ -993,11 +1093,14 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd, int force) if (!cfqq) goto new_queue; + if (cfq_cfqq_expired(cfqq)) + goto new_queue; + /* * slice has expired */ - if (!cfqq->must_dispatch && time_after(jiffies, cfqq->slice_end)) - goto new_queue; + if (!cfq_cfqq_must_dispatch(cfqq) && time_after(now, cfqq->slice_end)) + goto expire; /* * if queue has requests, dispatch one. 
if not, check if @@ -1005,17 +1108,18 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd, int force) */ if (!RB_EMPTY(&cfqq->sort_list)) goto keep_queue; - else if (!force && cfq_cfqq_sync(cfqq) && + else if (!force && cfq_cfqq_class_sync(cfqq) && time_before(now, cfqq->slice_end)) { if (cfq_arm_slice_timer(cfqd, cfqq)) return NULL; } -new_queue: +expire: cfq_slice_expired(cfqd, 0); - cfq_set_active_queue(cfqd); +new_queue: + cfqq = cfq_set_active_queue(cfqd); keep_queue: - return cfqd->active_queue; + return cfqq; } static int @@ -1083,8 +1187,8 @@ cfq_dispatch_requests(request_queue_t *q, int max_dispatch, int force) cfqq = cfq_select_queue(cfqd, force); if (cfqq) { - cfqq->wait_request = 0; - cfqq->must_dispatch = 0; + cfq_clear_cfqq_must_dispatch(cfqq); + cfq_clear_cfqq_wait_request(cfqq); del_timer(&cfqd->idle_slice_timer); if (cfq_class_idle(cfqq)) @@ -1108,10 +1212,10 @@ static inline void cfq_account_dispatch(struct cfq_rq *crq) * accounted bit is necessary since some drivers will call * elv_next_request() many times for the same request (eg ide) */ - if (crq->accounted) + if (cfq_crq_in_driver(crq)) return; - crq->accounted = 1; + cfq_mark_crq_in_driver(crq); cfqd->rq_in_driver++; } @@ -1121,7 +1225,7 @@ cfq_account_completion(struct cfq_queue *cfqq, struct cfq_rq *crq) struct cfq_data *cfqd = cfqq->cfqd; unsigned long now; - if (!crq->accounted) + if (!cfq_crq_in_driver(crq)) return; now = jiffies; @@ -1132,12 +1236,18 @@ cfq_account_completion(struct cfq_queue *cfqq, struct cfq_rq *crq) if (!cfq_class_idle(cfqq)) cfqd->last_end_request = now; - if (!cfqq->in_flight && cfqq->on_rr) { - cfqq->service_last = now; - cfq_resort_rr_list(cfqq, 0); + if (!cfq_cfqq_dispatched(cfqq)) { + if (cfq_cfqq_on_rr(cfqq)) { + cfqq->service_last = now; + cfq_resort_rr_list(cfqq, 0); + } + if (cfq_cfqq_expired(cfqq)) { + __cfq_slice_expired(cfqd, cfqq, 0); + cfq_schedule_dispatch(cfqd); + } } - if (crq->is_sync) + if (cfq_crq_is_sync(crq)) crq->io_context->last_end_request = now; } @@ -1153,10 +1263,13 @@ dispatch: crq = RQ_DATA(rq); if (crq) { + struct cfq_queue *cfqq = crq->cfq_queue; + /* * if idle window is disabled, allow queue buildup */ - if (!crq->in_flight && !crq->cfq_queue->idle_window && + if (!cfq_crq_in_driver(crq) && + !cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver >= cfqd->cfq_max_depth) return NULL; @@ -1190,11 +1303,11 @@ static void cfq_put_queue(struct cfq_queue *cfqq) BUG_ON(rb_first(&cfqq->sort_list)); BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); - BUG_ON(cfqq->on_rr); + BUG_ON(cfq_cfqq_on_rr(cfqq)); if (unlikely(cfqd->active_queue == cfqq)) { - cfq_slice_expired(cfqd, 0); - kblockd_schedule_work(&cfqd->unplug_work); + __cfq_slice_expired(cfqd, cfqq, 0); + cfq_schedule_dispatch(cfqd); } cfq_put_cfqd(cfqq->cfqd); @@ -1208,15 +1321,17 @@ static void cfq_put_queue(struct cfq_queue *cfqq) } static inline struct cfq_queue * -__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, const int hashval) +__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio, + const int hashval) { struct hlist_head *hash_list = &cfqd->cfq_hash[hashval]; struct hlist_node *entry, *next; hlist_for_each_safe(entry, next, hash_list) { struct cfq_queue *__cfqq = list_entry_qhash(entry); + const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->ioprio_class, __cfqq->ioprio); - if (__cfqq->key == key) + if (__cfqq->key == key && (__p == prio || prio == CFQ_KEY_ANY)) return __cfqq; } @@ -1224,9 +1339,9 @@ __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned 
int key, const int hashval) } static struct cfq_queue * -cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key) +cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio) { - return __cfq_find_cfq_hash(cfqd, key, hash_long(key, CFQ_QHASH_SHIFT)); + return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT)); } static void cfq_free_io_context(struct cfq_io_context *cic) @@ -1255,8 +1370,8 @@ static void cfq_exit_single_io_context(struct cfq_io_context *cic) spin_lock(q->queue_lock); if (unlikely(cic->cfqq == cfqd->active_queue)) { - cfq_slice_expired(cfqd, 0); - kblockd_schedule_work(&cfqd->unplug_work); + __cfq_slice_expired(cfqd, cic->cfqq, 0); + cfq_schedule_dispatch(cfqd); } cfq_put_queue(cic->cfqq); @@ -1313,7 +1428,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq) struct task_struct *tsk = current; int ioprio_class; - if (!cfqq->prio_changed) + if (!cfq_cfqq_prio_changed(cfqq)) return; ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio); @@ -1338,7 +1453,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq) case IOPRIO_CLASS_IDLE: cfqq->ioprio_class = IOPRIO_CLASS_IDLE; cfqq->ioprio = 7; - cfqq->idle_window = 0; + cfq_clear_cfqq_idle_window(cfqq); break; } @@ -1349,10 +1464,10 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq) cfqq->org_ioprio = cfqq->ioprio; cfqq->org_ioprio_class = cfqq->ioprio_class; - if (cfqq->on_rr) + if (cfq_cfqq_on_rr(cfqq)) cfq_resort_rr_list(cfqq, 0); - cfqq->prio_changed = 0; + cfq_clear_cfqq_prio_changed(cfqq); } static inline void changed_ioprio(struct cfq_queue *cfqq) @@ -1361,7 +1476,7 @@ static inline void changed_ioprio(struct cfq_queue *cfqq) struct cfq_data *cfqd = cfqq->cfqd; spin_lock(cfqd->queue->queue_lock); - cfqq->prio_changed = 1; + cfq_mark_cfqq_prio_changed(cfqq); cfq_init_prio_data(cfqq); spin_unlock(cfqd->queue->queue_lock); } @@ -1383,13 +1498,14 @@ static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio) } static struct cfq_queue * -cfq_get_queue(struct cfq_data *cfqd, unsigned int key, int gfp_mask) +cfq_get_queue(struct cfq_data *cfqd, unsigned int key, unsigned short ioprio, + int gfp_mask) { const int hashval = hash_long(key, CFQ_QHASH_SHIFT); struct cfq_queue *cfqq, *new_cfqq = NULL; retry: - cfqq = __cfq_find_cfq_hash(cfqd, key, hashval); + cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval); if (!cfqq) { if (new_cfqq) { @@ -1423,10 +1539,9 @@ retry: * set ->slice_left to allow preemption for a new process */ cfqq->slice_left = 2 * cfqd->cfq_slice_idle; - cfqq->idle_window = 1; - cfqq->ioprio = -1; - cfqq->ioprio_class = -1; - cfqq->prio_changed = 1; + cfq_mark_cfqq_idle_window(cfqq); + cfq_mark_cfqq_prio_changed(cfqq); + cfq_init_prio_data(cfqq); } if (new_cfqq) @@ -1553,7 +1668,7 @@ static void cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_io_context *cic) { - int enable_idle = cfqq->idle_window; + int enable_idle = cfq_cfqq_idle_window(cfqq); if (!cic->ioc->task || !cfqd->cfq_slice_idle) enable_idle = 0; @@ -1564,7 +1679,10 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, enable_idle = 1; } - cfqq->idle_window = enable_idle; + if (enable_idle) + cfq_mark_cfqq_idle_window(cfqq); + else + cfq_clear_cfqq_idle_window(cfqq); } @@ -1586,14 +1704,14 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (cfq_class_idle(cfqq)) return 1; - if (!new_cfqq->wait_request) + if (!cfq_cfqq_wait_request(new_cfqq)) return 0; /* * if it doesn't have slice left, forget it */ if 
(new_cfqq->slice_left < cfqd->cfq_slice_idle) return 0; - if (crq->is_sync && !cfq_cfqq_sync(cfqq)) + if (cfq_crq_is_sync(crq) && !cfq_cfqq_sync(cfqq)) return 1; return 0; @@ -1614,7 +1732,7 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2; cfqq->slice_end = cfqq->slice_left + jiffies; - cfq_slice_expired(cfqd, 1); + __cfq_slice_expired(cfqd, cfqq, 1); __cfq_set_active_queue(cfqd, cfqq); } @@ -1639,7 +1757,7 @@ static void cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_rq *crq) { - const int sync = crq->is_sync; + const int sync = cfq_crq_is_sync(crq); cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq); @@ -1658,8 +1776,8 @@ cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, * immediately and flag that we must not expire this queue * just now */ - if (cfqq->wait_request) { - cfqq->must_dispatch = 1; + if (cfq_cfqq_wait_request(cfqq)) { + cfq_mark_cfqq_must_dispatch(cfqq); del_timer(&cfqd->idle_slice_timer); cfq_start_queueing(cfqd, cfqq); } @@ -1670,7 +1788,7 @@ cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, * has some old slice time left and is of higher priority */ cfq_preempt_queue(cfqd, cfqq); - cfqq->must_dispatch = 1; + cfq_mark_cfqq_must_dispatch(cfqq); cfq_start_queueing(cfqd, cfqq); } } @@ -1713,7 +1831,7 @@ cfq_insert_request(request_queue_t *q, struct request *rq, int where) * be kicked by __make_request() afterward. * Kick it here. */ - kblockd_schedule_work(&cfqd->unplug_work); + cfq_schedule_dispatch(cfqd); break; case ELEVATOR_INSERT_FRONT: list_add(&rq->queuelist, &q->queue_head); @@ -1750,9 +1868,11 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq) cfqq = crq->cfq_queue; - if (crq->in_flight) { - WARN_ON(!cfqq->in_flight); - cfqq->in_flight--; + if (cfq_crq_in_flight(crq)) { + const int sync = cfq_crq_is_sync(crq); + + WARN_ON(!cfqq->on_dispatch[sync]); + cfqq->on_dispatch[sync]--; } cfq_account_completion(cfqq, crq); @@ -1814,7 +1934,7 @@ static void cfq_prio_boost(struct cfq_queue *cfqq) * refile between round-robin lists if we moved the priority class */ if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio) && - cfqq->on_rr) + cfq_cfqq_on_rr(cfqq)) cfq_resort_rr_list(cfqq, 0); } @@ -1830,23 +1950,27 @@ static inline int __cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct task_struct *task, int rw) { - if (cfqq->wait_request && cfqq->must_alloc) +#if 1 + if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) && + !cfq_cfqq_must_alloc_slice) { + cfq_mark_cfqq_must_alloc_slice(cfqq); return ELV_MQUEUE_MUST; + } return ELV_MQUEUE_MAY; -#if 0 +#else if (!cfqq || task->flags & PF_MEMALLOC) return ELV_MQUEUE_MAY; - if (!cfqq->allocated[rw] || cfqq->must_alloc) { - if (cfqq->wait_request) + if (!cfqq->allocated[rw] || cfq_cfqq_must_alloc(cfqq)) { + if (cfq_cfqq_wait_request(cfqq)) return ELV_MQUEUE_MUST; /* * only allow 1 ELV_MQUEUE_MUST per slice, otherwise we * can quickly flood the queue with writes from a single task */ - if (rw == READ || !cfqq->must_alloc_slice) { - cfqq->must_alloc_slice = 1; + if (rw == READ || !cfq_cfqq_must_alloc_slice) { + cfq_mark_cfqq_must_alloc_slice(cfqq); return ELV_MQUEUE_MUST; } @@ -1881,7 +2005,7 @@ static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio) * so just lookup a possibly existing queue, or return 'may queue' * if that fails */ - cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw)); + cfqq = 
cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw), tsk->ioprio); if (cfqq) { cfq_init_prio_data(cfqq); cfq_prio_boost(cfqq); @@ -1943,15 +2067,17 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio, int gfp_mask) { struct cfq_data *cfqd = q->elevator->elevator_data; + struct task_struct *tsk = current; struct cfq_io_context *cic; const int rw = rq_data_dir(rq); + pid_t key = cfq_queue_pid(tsk, rw); struct cfq_queue *cfqq; struct cfq_rq *crq; unsigned long flags; might_sleep_if(gfp_mask & __GFP_WAIT); - cic = cfq_get_io_context(cfqd, cfq_queue_pid(current, rw), gfp_mask); + cic = cfq_get_io_context(cfqd, key, gfp_mask); spin_lock_irqsave(q->queue_lock, flags); @@ -1959,7 +2085,7 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio, goto queue_fail; if (!cic->cfqq) { - cfqq = cfq_get_queue(cfqd, current->pid, gfp_mask); + cfqq = cfq_get_queue(cfqd, key, tsk->ioprio, gfp_mask); if (!cfqq) goto queue_fail; @@ -1968,7 +2094,7 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio, cfqq = cic->cfqq; cfqq->allocated[rw]++; - cfqq->must_alloc = 0; + cfq_clear_cfqq_must_alloc(cfqq); cfqd->rq_starved = 0; atomic_inc(&cfqq->ref); spin_unlock_irqrestore(q->queue_lock, flags); @@ -1981,9 +2107,15 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio, INIT_HLIST_NODE(&crq->hash); crq->cfq_queue = cfqq; crq->io_context = cic; - crq->in_flight = crq->accounted = 0; - crq->is_sync = (rw == READ || process_sync(current)); - crq->requeued = 0; + cfq_clear_crq_in_flight(crq); + cfq_clear_crq_in_driver(crq); + cfq_clear_crq_requeued(crq); + + if (rw == READ || process_sync(tsk)) + cfq_mark_crq_is_sync(crq); + else + cfq_clear_crq_is_sync(crq); + rq->elevator_private = crq; return 0; } @@ -1991,7 +2123,7 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio, spin_lock_irqsave(q->queue_lock, flags); cfqq->allocated[rw]--; if (!(cfqq->allocated[0] + cfqq->allocated[1])) - cfqq->must_alloc = 1; + cfq_mark_cfqq_must_alloc(cfqq); cfq_put_queue(cfqq); queue_fail: if (cic) @@ -2002,7 +2134,7 @@ queue_fail: * that would be an extremely rare OOM situation */ cfqd->rq_starved = 1; - kblockd_schedule_work(&cfqd->unplug_work); + cfq_schedule_dispatch(cfqd); spin_unlock_irqrestore(q->queue_lock, flags); return 1; } @@ -2068,15 +2200,14 @@ static void cfq_idle_slice_timer(unsigned long data) * not expired and it has a request pending, let it dispatch */ if (!RB_EMPTY(&cfqq->sort_list)) { - cfqq->must_dispatch = 1; + cfq_mark_cfqq_must_dispatch(cfqq); goto out_kick; } } expire: cfq_slice_expired(cfqd, 0); out_kick: - if (cfq_pending_requests(cfqd)) - kblockd_schedule_work(&cfqd->unplug_work); + cfq_schedule_dispatch(cfqd); out_cont: spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); } @@ -2099,11 +2230,17 @@ static void cfq_idle_class_timer(unsigned long data) cfqd->idle_class_timer.expires = end; add_timer(&cfqd->idle_class_timer); } else - kblockd_schedule_work(&cfqd->unplug_work); + cfq_schedule_dispatch(cfqd); spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); } +static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) +{ + del_timer_sync(&cfqd->idle_slice_timer); + del_timer_sync(&cfqd->idle_class_timer); + blk_sync_queue(cfqd->queue); +} static void cfq_put_cfqd(struct cfq_data *cfqd) { @@ -2112,7 +2249,7 @@ static void cfq_put_cfqd(struct cfq_data *cfqd) if (!atomic_dec_and_test(&cfqd->ref)) return; - blk_sync_queue(q); + cfq_shutdown_timer_wq(cfqd); blk_put_queue(q); @@ -2126,8 +2263,7 @@ static void 
cfq_exit_queue(elevator_t *e) { struct cfq_data *cfqd = e->elevator_data; - del_timer_sync(&cfqd->idle_slice_timer); - del_timer_sync(&cfqd->idle_class_timer); + cfq_shutdown_timer_wq(cfqd); cfq_put_cfqd(cfqd); } @@ -2198,6 +2334,7 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e) cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; cfqd->cfq_max_depth = cfq_max_depth; + return 0; out_crqpool: kfree(cfqd->cfq_hash); @@ -2369,6 +2506,7 @@ static struct cfq_fs_entry cfq_max_depth_entry = { .show = cfq_max_depth_show, .store = cfq_max_depth_store, }; + static struct attribute *default_attrs[] = { &cfq_quantum_entry.attr, &cfq_queued_entry.attr, -- cgit v1.2.3 From 96c51ce94e8415d2dfb08358bbd50e1589111f33 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 27 Jun 2005 14:49:39 +0200 Subject: [PATCH] CFQ io scheduler: scheduler switch oops If cfq is managing a queue and a new scheduler is later selected, it is possible for the cfqd unplug_work work to be queued after the kblockd work struct has been flushed. The problem is the ordering of cfq_shutdown_timer_wq() and blk_put_queue() in cfq_put_cfqd(). The latter may rearm the work, leaving cfq_kick_queue() with dead data. Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- drivers/block/cfq-iosched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c index 1ecb179b860..ff1cc968f96 100644 --- a/drivers/block/cfq-iosched.c +++ b/drivers/block/cfq-iosched.c @@ -2249,10 +2249,11 @@ static void cfq_put_cfqd(struct cfq_data *cfqd) if (!atomic_dec_and_test(&cfqd->ref)) return; - cfq_shutdown_timer_wq(cfqd); - blk_put_queue(q); + cfq_shutdown_timer_wq(cfqd); + q->elevator->elevator_data = NULL; + mempool_destroy(cfqd->crq_pool); kfree(cfqd->crq_hash); kfree(cfqd->cfq_hash); -- cgit v1.2.3 From 8c8709334cec803368a432a33e0f2e116d48fe07 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 27 Jun 2005 14:36:34 -0700 Subject: [PATCH] ppc32: Remove CONFIG_PMAC_PBOOK This patch removes CONFIG_PMAC_PBOOK (PowerBook support). This is now split into CONFIG_PMAC_MEDIABAY for the actual hotswap bay that some powerbooks have, CONFIG_PM for power management related code, and just left out of any CONFIG_* option for some generally useful stuff that can be used on non-laptops as well. 
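The swim3 hunks below all reduce to one pattern: a media-bay probe only makes sense when the bay driver is configured in, so each call site is wrapped in a preprocessor guard. Roughly (the names follow the driver, but treat this as a sketch of the idea rather than the exact patched lines):

#ifdef CONFIG_PMAC_MEDIABAY
	/* drive sits in a hotswap bay: bail out if no floppy is present */
	if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD))
		return -ENXIO;
#endif

With CONFIG_PMAC_MEDIABAY unset the check compiles away entirely, so non-PowerBook configurations no longer need the old CONFIG_PMAC_PBOOK umbrella just to build the floppy driver.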
Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/swim3.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 5b09cf154ac..e5f7494c00e 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -253,7 +253,7 @@ static int floppy_revalidate(struct gendisk *disk); static int swim3_add_device(struct device_node *swims); int swim3_init(void); -#ifndef CONFIG_PMAC_PBOOK +#ifndef CONFIG_PMAC_MEDIABAY #define check_media_bay(which, what) 1 #endif @@ -297,9 +297,11 @@ static void do_fd_request(request_queue_t * q) int i; for(i=0;imedia_bay && check_media_bay(fs->media_bay, MB_FD)) return -ENXIO; +#endif switch (cmd) { case FDEJECT: @@ -881,8 +885,10 @@ static int floppy_open(struct inode *inode, struct file *filp) int n, err = 0; if (fs->ref_count == 0) { +#ifdef CONFIG_PMAC_MEDIABAY if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD)) return -ENXIO; +#endif out_8(&sw->setup, S_IBM_DRIVE | S_FCLK_DIV2); out_8(&sw->control_bic, 0xff); out_8(&sw->mode, 0x95); @@ -967,8 +973,10 @@ static int floppy_revalidate(struct gendisk *disk) struct swim3 __iomem *sw; int ret, n; +#ifdef CONFIG_PMAC_MEDIABAY if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD)) return -ENXIO; +#endif sw = fs->swim3; grab_drive(fs, revalidating, 0); -- cgit v1.2.3 From 3de0a70bd926ff974adb27a38d4fd1049f05e54e Mon Sep 17 00:00:00 2001 From: Mike Miller Date: Mon, 27 Jun 2005 14:36:48 -0700 Subject: [PATCH] cciss: pci id fix This patch fixes a PCI ID I got wrong before. It also adds support for another new SAS controller due out this summer. I didn't have a marketing name prior to my last submission. Also modifies the copyright date range. Signed-off-by: Mike Miller Acked-by: Jeff Garzik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/cciss.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index abde27027c0..0cd606ce222 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1,6 +1,6 @@ /* * Disk Array driver for HP SA 5xxx and 6xxx Controllers - * Copyright 2000, 2002 Hewlett-Packard Development Company, L.P. + * Copyright 2000, 2005 Hewlett-Packard Development Company, L.P. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -54,7 +54,7 @@ MODULE_AUTHOR("Hewlett-Packard Company"); MODULE_DESCRIPTION("Driver for HP Controller SA5xxx SA6xxx version 2.6.6"); MODULE_SUPPORTED_DEVICE("HP SA5i SA5i+ SA532 SA5300 SA5312 SA641 SA642 SA6400" - " SA6i P600 P800 E400"); + " SA6i P600 P800 E400 E300"); MODULE_LICENSE("GPL"); #include "cciss_cmd.h" @@ -85,8 +85,10 @@ static const struct pci_device_id cciss_pci_device_id[] = { 0x103C, 0x3225, 0, 0, 0}, { PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSB, 0x103c, 0x3223, 0, 0, 0}, - { PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSB, + { PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103c, 0x3231, 0, 0, 0}, + { PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, + 0x103c, 0x3233, 0, 0, 0}, {0,} }; MODULE_DEVICE_TABLE(pci, cciss_pci_device_id); @@ -110,6 +112,7 @@ static struct board_type products[] = { { 0x3225103C, "Smart Array P600", &SA5_access}, { 0x3223103C, "Smart Array P800", &SA5_access}, { 0x3231103C, "Smart Array E400", &SA5_access}, + { 0x3233103C, "Smart Array E300", &SA5_access}, }; /* How long to wait (in millesconds) for board to go into simple mode */ -- cgit v1.2.3 From cd6fb584cf7f18ec6b221192b57d712ecc8c1859 Mon Sep 17 00:00:00 2001 From: Mike Miller Date: Mon, 27 Jun 2005 14:36:49 -0700 Subject: [PATCH] cciss: pci domain info pass 2 This is pass 2 of my patch to add pci domain info to an existing ioctl. This time I insert the domain between dev_fn and board_id as Willy suggested and change the var to unsigned short to ease Christoph's concerns. Although I thought unsigned int was the correct var type for this. I also thought it didn't matter where I inserted it in the structure. Signed-off-by: Mike Miller Acked-by: Jeff Garzik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/cciss.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block') diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 0cd606ce222..d5d0fa538f1 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -638,6 +638,7 @@ static int cciss_ioctl(struct inode *inode, struct file *filep, cciss_pci_info_struct pciinfo; if (!arg) return -EINVAL; + pciinfo.domain = pci_domain_nr(host->pdev->bus); pciinfo.bus = host->pdev->bus->number; pciinfo.dev_fn = host->pdev->devfn; pciinfo.board_id = host->board_id; -- cgit v1.2.3 From 60564a313a5738960064d6c555ec066d9332f278 Mon Sep 17 00:00:00 2001 From: Mike Miller Date: Mon, 27 Jun 2005 14:36:50 -0700 Subject: [PATCH] cciss: remove partition info from CCISS_GETLUNINFO This patch fulfills a promise I made to Christoph sometime back. I am removing the partition info from the CCISS_GETLUNINFO ioctl as I was informed my "driver had no damn business reading that structure." ;) The application folks are to use /proc or /sys for partition info from now on. I am only aware of a few apps that use this ioctl and I'm not sure they ever used the partition info. 
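Since the message above points applications at /proc and /sys instead of the removed ioctl fields, a rough userspace sketch of the replacement is shown here. It is illustrative only and not from the kernel tree: the disk name "cciss/c0d0" and the count_partitions() helper are assumptions; the code simply counts the /proc/partitions entries whose name extends the disk name.

/* count_partitions.c -- illustrative userspace sketch, not kernel code */
#include <stdio.h>
#include <string.h>

static int count_partitions(const char *disk)
{
        char line[256], name[128];
        unsigned int major, minor;
        unsigned long long blocks;
        size_t len = strlen(disk);
        int count = 0;
        FILE *fp = fopen("/proc/partitions", "r");

        if (!fp)
                return -1;
        while (fgets(line, sizeof(line), fp)) {
                /* rows look like "major minor #blocks name"; skip header/blank lines */
                if (sscanf(line, "%u %u %llu %127s",
                           &major, &minor, &blocks, name) != 4)
                        continue;
                /* "cciss/c0d0p1" extends "cciss/c0d0", so count it as a partition */
                if (strncmp(name, disk, len) == 0 && name[len] != '\0')
                        count++;
        }
        fclose(fp);
        return count;
}

int main(void)
{
        printf("partitions: %d\n", count_partitions("cciss/c0d0"));
        return 0;
}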
Signed-off-by: Mike Miller Acked-by: Jeff Garzik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/cciss.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index d5d0fa538f1..653512b7757 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -791,13 +791,6 @@ static int cciss_ioctl(struct inode *inode, struct file *filep, luninfo.LunID = drv->LunID; luninfo.num_opens = drv->usage_count; luninfo.num_parts = 0; - /* count partitions 1 to 15 with sizes > 0 */ - for (i = 0; i < MAX_PART - 1; i++) { - if (!disk->part[i]) - continue; - if (disk->part[i]->nr_sects != 0) - luninfo.num_parts++; - } if (copy_to_user(argp, &luninfo, sizeof(LogvolInfo_struct))) return -EFAULT; -- cgit v1.2.3 From 99f95e5286df2f69edab8a04c7080d986ee4233b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 27 Jun 2005 20:14:05 -0700 Subject: [PATCH] cfq build fix drivers/block/cfq-iosched.c: In function 'cfq_put_queue': drivers/block/cfq-iosched.c:303: sorry, unimplemented: inlining failed in call to 'cfq_pending_requests': function body not available drivers/block/cfq-iosched.c:1080: sorry, unimplemented: called from here drivers/block/cfq-iosched.c: In function '__cfq_may_queue': drivers/block/cfq-iosched.c:1955: warning: the address of 'cfq_cfqq_must_alloc_slice', will always evaluate as 'true' make[1]: *** [drivers/block/cfq-iosched.o] Error 1 make: *** [drivers/block/cfq-iosched.o] Error 2 Cc: Jeff Garzik Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/cfq-iosched.c | 49 ++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 25 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c index ff1cc968f96..de5746e38af 100644 --- a/drivers/block/cfq-iosched.c +++ b/drivers/block/cfq-iosched.c @@ -300,7 +300,6 @@ CFQ_CRQ_FNS(requeued); static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short); static void cfq_dispatch_sort(request_queue_t *, struct cfq_rq *); static void cfq_put_cfqd(struct cfq_data *cfqd); -static inline int cfq_pending_requests(struct cfq_data *cfqd); #define process_sync(tsk) ((tsk)->flags & PF_SYNCWRITE) @@ -348,6 +347,28 @@ static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) return NULL; } +static inline int cfq_pending_requests(struct cfq_data *cfqd) +{ + return !list_empty(&cfqd->queue->queue_head) || cfqd->busy_queues; +} + +/* + * scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing + */ +static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) +{ + if (!cfqd->rq_in_driver && cfq_pending_requests(cfqd)) + kblockd_schedule_work(&cfqd->unplug_work); +} + +static int cfq_queue_empty(request_queue_t *q) +{ + struct cfq_data *cfqd = q->elevator->elevator_data; + + return !cfq_pending_requests(cfqd); +} + /* * Lifted from AS - choose which of crq1 and crq2 that is best served now. * We choose the request that is closest to the head right now. 
Distance @@ -1071,16 +1092,6 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); } -/* - * scheduler run of queue, if there are requests pending and no one in the - * driver that will restart queueing - */ -static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) -{ - if (!cfqd->rq_in_driver && cfq_pending_requests(cfqd)) - kblockd_schedule_work(&cfqd->unplug_work); -} - /* * get next queue for service */ @@ -1846,18 +1857,6 @@ cfq_insert_request(request_queue_t *q, struct request *rq, int where) } } -static inline int cfq_pending_requests(struct cfq_data *cfqd) -{ - return !list_empty(&cfqd->queue->queue_head) || cfqd->busy_queues; -} - -static int cfq_queue_empty(request_queue_t *q) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - - return !cfq_pending_requests(cfqd); -} - static void cfq_completed_request(request_queue_t *q, struct request *rq) { struct cfq_rq *crq = RQ_DATA(rq); @@ -1952,7 +1951,7 @@ __cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq, { #if 1 if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) && - !cfq_cfqq_must_alloc_slice) { + !cfq_cfqq_must_alloc_slice(cfqq)) { cfq_mark_cfqq_must_alloc_slice(cfqq); return ELV_MQUEUE_MUST; } @@ -1969,7 +1968,7 @@ __cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq, * only allow 1 ELV_MQUEUE_MUST per slice, otherwise we * can quickly flood the queue with writes from a single task */ - if (rw == READ || !cfq_cfqq_must_alloc_slice) { + if (rw == READ || !cfq_cfqq_must_alloc_slice(cfqq)) { cfq_mark_cfqq_must_alloc_slice(cfqq); return ELV_MQUEUE_MUST; } -- cgit v1.2.3 From 97afa0a25afb43a82954662773a9d48d61b2996a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 27 Jun 2005 22:29:31 -0700 Subject: [PATCH] cciss_ioctl() warning fix Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/cciss.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 653512b7757..3e9fb6e4a52 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -786,7 +786,6 @@ static int cciss_ioctl(struct inode *inode, struct file *filep, case CCISS_GETLUNINFO: { LogvolInfo_struct luninfo; - int i; luninfo.LunID = drv->LunID; luninfo.num_opens = drv->usage_count; -- cgit v1.2.3 From 082cf69eb82681f4eacb3a5653834c7970714bef Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 28 Jun 2005 16:35:11 +0200 Subject: [PATCH] ll_rw_blk: prevent huge request allocations Currently we cap request allocations at q->nr_requests, but we allow a batching io context to allocate up to 32 more (default setting). This can flood the queue with request allocations, with only a few batching processes. The real fix would be to limit the number of batchers, but as that isn't currently tracked, I suggest we just cap the maximum number of allocated requests to eg 50% over the limit. This was observed in real life, users typically see this as vmstat bo numbers going off the wall with seconds of no queueing afterwards. Behaviour this bursty is not beneficial. 
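The arithmetic behind that 50% figure is easy to check in isolation. The sketch below is not from the patch; it merely mirrors the rl->count[rw] >= 3 * q->nr_requests / 2 test added in the hunk that follows, and the 128 figure is only assumed as the usual nr_requests default, under which batching queuers are cut off at 192 allocated requests per direction.

/* cap_sketch.c -- illustrative arithmetic only, not kernel code */
#include <stdio.h>

/* mirrors the check added below: refuse once a direction already holds
 * 50% more allocated requests than the configured limit */
static int over_allocation_cap(unsigned int count, unsigned int nr_requests)
{
        return count >= (3 * nr_requests / 2);
}

int main(void)
{
        unsigned int nr_requests = 128; /* assumed default queue depth */

        printf("191 blocked? %d\n", over_allocation_cap(191, nr_requests)); /* 0 */
        printf("192 blocked? %d\n", over_allocation_cap(192, nr_requests)); /* 1 */
        return 0;
}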
Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- drivers/block/ll_rw_blk.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 234fdcfbdf0..6c98cf04271 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1912,6 +1912,15 @@ static struct request *get_request(request_queue_t *q, int rw, struct bio *bio, } get_rq: + /* + * Only allow batching queuers to allocate up to 50% over the defined + * limit of requests, otherwise we could have thousands of requests + * allocated with any setting of ->nr_requests + */ + if (rl->count[rw] >= (3 * q->nr_requests / 2)) { + spin_unlock_irq(q->queue_lock); + goto out; + } rl->count[rw]++; rl->starved[rw] = 0; if (rl->count[rw] >= queue_congestion_on_threshold(q)) -- cgit v1.2.3 From 450991bc1026135ee30482a4a806d069915ab2f6 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 28 Jun 2005 20:45:13 -0700 Subject: [PATCH] blk: __make_request efficiency In the case where the request is not able to be merged by the elevator, don't retake the lock and retry the merge mechanism after allocating a new request. Instead assume that the chance of a merge remains slim, and now that we've done most of the work allocating a request we may as well just go with it. Also be rid of the GFP_ATOMIC allocation: we've got working mempools for the block layer now, so let's save atomic memory for things like networking. Lastly, in get_request_wait, do an initial get_request call before going into the waitqueue. This is reported to help efficiency. Signed-off-by: Nick Piggin Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/ll_rw_blk.c | 62 ++++++++++++++++------------------------------- 1 file changed, 21 insertions(+), 41 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 6c98cf04271..67431f28015 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1971,10 +1971,11 @@ out: static struct request *get_request_wait(request_queue_t *q, int rw, struct bio *bio) { - DEFINE_WAIT(wait); struct request *rq; - do { + rq = get_request(q, rw, bio, GFP_NOIO); + while (!rq) { + DEFINE_WAIT(wait); struct request_list *rl = &q->rq; prepare_to_wait_exclusive(&rl->wait[rw], &wait, @@ -1999,7 +2000,7 @@ static struct request *get_request_wait(request_queue_t *q, int rw, put_io_context(ioc); } finish_wait(&rl->wait[rw], &wait); - } while (!rq); + } return rq; } @@ -2521,7 +2522,7 @@ EXPORT_SYMBOL(blk_attempt_remerge); static int __make_request(request_queue_t *q, struct bio *bio) { - struct request *req, *freereq = NULL; + struct request *req; int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync; unsigned short prio; sector_t sector; @@ -2549,14 +2550,9 @@ static int __make_request(request_queue_t *q, struct bio *bio) goto end_io; } -again: spin_lock_irq(q->queue_lock); - if (elv_queue_empty(q)) { - blk_plug_device(q); - goto get_rq; - } - if (barrier) + if (unlikely(barrier) || elv_queue_empty(q)) goto get_rq; el_ret = elv_merge(q, &req, bio); @@ -2601,40 +2597,23 @@ again: elv_merged_request(q, req); goto out; - /* - * elevator says don't/can't merge. get new request - */ - case ELEVATOR_NO_MERGE: - break; - + /* ELV_NO_MERGE: elevator says don't/can't merge. 
*/ default: - printk("elevator returned crap (%d)\n", el_ret); - BUG(); + ; } +get_rq: /* - * Grab a free request from the freelist - if that is empty, check - * if we are doing read ahead and abort instead of blocking for - * a free slot. + * Grab a free request. This is might sleep but can not fail. + */ + spin_unlock_irq(q->queue_lock); + req = get_request_wait(q, rw, bio); + /* + * After dropping the lock and possibly sleeping here, our request + * may now be mergeable after it had proven unmergeable (above). + * We don't worry about that case for efficiency. It won't happen + * often, and the elevators are able to handle it. */ -get_rq: - if (freereq) { - req = freereq; - freereq = NULL; - } else { - spin_unlock_irq(q->queue_lock); - if ((freereq = get_request(q, rw, bio, GFP_ATOMIC)) == NULL) { - /* - * READA bit set - */ - err = -EWOULDBLOCK; - if (bio_rw_ahead(bio)) - goto end_io; - - freereq = get_request_wait(q, rw, bio); - } - goto again; - } req->flags |= REQ_CMD; @@ -2663,10 +2642,11 @@ get_rq: req->rq_disk = bio->bi_bdev->bd_disk; req->start_time = jiffies; + spin_lock_irq(q->queue_lock); + if (elv_queue_empty(q)) + blk_plug_device(q); add_request(q, req); out: - if (freereq) - __blk_put_request(q, freereq); if (sync) __generic_unplug_device(q); -- cgit v1.2.3 From d6344532a26a318c128102507f6328aaafe02d4d Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 28 Jun 2005 20:45:14 -0700 Subject: [PATCH] blk: reduce locking Change around locking a bit for a result of 1-2 less spin lock unlock pairs in request submission paths. Signed-off-by: Nick Piggin Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/ll_rw_blk.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 67431f28015..5caebe2cf0a 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1867,19 +1867,20 @@ static void freed_request(request_queue_t *q, int rw) #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) /* - * Get a free request, queue_lock must not be held + * Get a free request, queue_lock must be held. + * Returns NULL on failure, with queue_lock held. + * Returns !NULL on success, with queue_lock *not held*. */ static struct request *get_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask) { struct request *rq = NULL; struct request_list *rl = &q->rq; - struct io_context *ioc = get_io_context(gfp_mask); + struct io_context *ioc = get_io_context(GFP_ATOMIC); if (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) goto out; - spin_lock_irq(q->queue_lock); if (rl->count[rw]+1 >= q->nr_requests) { /* * The queue will fill after this allocation, so set it as @@ -1907,7 +1908,6 @@ static struct request *get_request(request_queue_t *q, int rw, struct bio *bio, * The queue is full and the allocating process is not a * "batcher", and not exempted by the IO scheduler */ - spin_unlock_irq(q->queue_lock); goto out; } @@ -1950,7 +1950,6 @@ rq_starved: if (unlikely(rl->count[rw] == 0)) rl->starved[rw] = 1; - spin_unlock_irq(q->queue_lock); goto out; } @@ -1967,6 +1966,8 @@ out: /* * No available requests for this queue, unplug the device and wait for some * requests to become available. + * + * Called with q->queue_lock held, and returns with it unlocked. 
*/ static struct request *get_request_wait(request_queue_t *q, int rw, struct bio *bio) @@ -1986,7 +1987,8 @@ static struct request *get_request_wait(request_queue_t *q, int rw, if (!rq) { struct io_context *ioc; - generic_unplug_device(q); + __generic_unplug_device(q); + spin_unlock_irq(q->queue_lock); io_schedule(); /* @@ -1998,6 +2000,8 @@ static struct request *get_request_wait(request_queue_t *q, int rw, ioc = get_io_context(GFP_NOIO); ioc_set_batching(q, ioc); put_io_context(ioc); + + spin_lock_irq(q->queue_lock); } finish_wait(&rl->wait[rw], &wait); } @@ -2011,14 +2015,18 @@ struct request *blk_get_request(request_queue_t *q, int rw, int gfp_mask) BUG_ON(rw != READ && rw != WRITE); - if (gfp_mask & __GFP_WAIT) + spin_lock_irq(q->queue_lock); + if (gfp_mask & __GFP_WAIT) { rq = get_request_wait(q, rw, NULL); - else + } else { rq = get_request(q, rw, NULL, gfp_mask); + if (!rq) + spin_unlock_irq(q->queue_lock); + } + /* q->queue_lock is unlocked at this point */ return rq; } - EXPORT_SYMBOL(blk_get_request); /** @@ -2605,9 +2613,10 @@ static int __make_request(request_queue_t *q, struct bio *bio) get_rq: /* * Grab a free request. This is might sleep but can not fail. + * Returns with the queue unlocked. */ - spin_unlock_irq(q->queue_lock); req = get_request_wait(q, rw, bio); + /* * After dropping the lock and possibly sleeping here, our request * may now be mergeable after it had proven unmergeable (above). -- cgit v1.2.3 From fb3cc4320e1fd87143683b540e459a2e20fdc9bb Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 28 Jun 2005 20:45:15 -0700 Subject: [PATCH] blk: light iocontext ops get_io_context needlessly turned off interrupts and checked for racing io context creations. Both of which aren't needed, because the io context can only be created while in process context of the current process. Also, split the function in 2. A light version, current_io_context does not elevate the reference count specifically, but can be used when in process context, because the process holds a reference itself. Signed-off-by: Nick Piggin Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/ll_rw_blk.c | 56 +++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 31 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 5caebe2cf0a..1197462bb6b 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1876,7 +1876,7 @@ static struct request *get_request(request_queue_t *q, int rw, struct bio *bio, { struct request *rq = NULL; struct request_list *rl = &q->rq; - struct io_context *ioc = get_io_context(GFP_ATOMIC); + struct io_context *ioc = current_io_context(GFP_ATOMIC); if (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) goto out; @@ -1959,7 +1959,6 @@ rq_starved: rq_init(q, rq); rq->rl = rl; out: - put_io_context(ioc); return rq; } @@ -1997,9 +1996,8 @@ static struct request *get_request_wait(request_queue_t *q, int rw, * up to a big batch of them for a small period time. * See ioc_batching, ioc_set_batching */ - ioc = get_io_context(GFP_NOIO); + ioc = current_io_context(GFP_NOIO); ioc_set_batching(q, ioc); - put_io_context(ioc); spin_lock_irq(q->queue_lock); } @@ -3282,24 +3280,20 @@ void exit_io_context(void) /* * If the current task has no IO context then create one and initialise it. - * If it does have a context, take a ref on it. + * Otherwise, return its existing IO context. 
* - * This is always called in the context of the task which submitted the I/O. - * But weird things happen, so we disable local interrupts to ensure exclusive - * access to *current. + * This returned IO context doesn't have a specifically elevated refcount, + * but since the current task itself holds a reference, the context can be + * used in general code, so long as it stays within `current` context. */ -struct io_context *get_io_context(int gfp_flags) +struct io_context *current_io_context(int gfp_flags) { struct task_struct *tsk = current; - unsigned long flags; struct io_context *ret; - local_irq_save(flags); ret = tsk->io_context; - if (ret) - goto out; - - local_irq_restore(flags); + if (likely(ret)) + return ret; ret = kmem_cache_alloc(iocontext_cachep, gfp_flags); if (ret) { @@ -3310,25 +3304,25 @@ struct io_context *get_io_context(int gfp_flags) ret->nr_batch_requests = 0; /* because this is 0 */ ret->aic = NULL; ret->cic = NULL; + tsk->io_context = ret; + } - local_irq_save(flags); - - /* - * very unlikely, someone raced with us in setting up the task - * io context. free new context and just grab a reference. - */ - if (!tsk->io_context) - tsk->io_context = ret; - else { - kmem_cache_free(iocontext_cachep, ret); - ret = tsk->io_context; - } + return ret; +} +EXPORT_SYMBOL(current_io_context); -out: +/* + * If the current task has no IO context then create one and initialise it. + * If it does have a context, take a ref on it. + * + * This is always called in the context of the task which submitted the I/O. + */ +struct io_context *get_io_context(int gfp_flags) +{ + struct io_context *ret; + ret = current_io_context(gfp_flags); + if (likely(ret)) atomic_inc(&ret->refcount); - local_irq_restore(flags); - } - return ret; } EXPORT_SYMBOL(get_io_context); -- cgit v1.2.3 From fd782a4a99d2d3e818b9465c427b10f7f027d7da Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 29 Jun 2005 15:15:40 +0100 Subject: [PATCH] Fix get_request nastiness get_request is now expected to be holding on to queue_lock, with interrupts disabled, when it returns NULL; but one path forgot that, causing all kinds of nastiness under swap load - badness backtraces, strange failures, BUGs. Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- drivers/block/ll_rw_blk.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 1197462bb6b..692a5fced76 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1917,10 +1917,9 @@ get_rq: * limit of requests, otherwise we could have thousands of requests * allocated with any setting of ->nr_requests */ - if (rl->count[rw] >= (3 * q->nr_requests / 2)) { - spin_unlock_irq(q->queue_lock); + if (rl->count[rw] >= (3 * q->nr_requests / 2)) goto out; - } + rl->count[rw]++; rl->starved[rw] = 0; if (rl->count[rw] >= queue_congestion_on_threshold(q)) -- cgit v1.2.3
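Taken together with the blk_get_request() hunk earlier in this series, the locking contract this fix restores can be sketched as follows. This is a hypothetical caller, not kernel code: get_request() must be entered with queue_lock held and interrupts disabled, returns NULL with the lock still held, and returns a request with the lock already dropped; get_request_wait() always comes back with the lock dropped.

/* hypothetical caller, for illustration of the lock contract only */
static struct request *example_allocate(request_queue_t *q, int rw, int gfp_mask)
{
        struct request *rq;

        spin_lock_irq(q->queue_lock);
        if (gfp_mask & __GFP_WAIT) {
                /* may sleep; always returns with queue_lock dropped */
                rq = get_request_wait(q, rw, NULL);
        } else {
                rq = get_request(q, rw, NULL, gfp_mask);
                /* NULL means we still own the lock and must drop it ourselves */
                if (!rq)
                        spin_unlock_irq(q->queue_lock);
        }
        /* queue_lock is not held here on any path */
        return rq;
}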