From ef9be1d336378de279d4e37779f1b83cebadbcc0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 11 Nov 2005 14:27:09 +0100
Subject: [BLOCK] as-iosched: update alias handling

Unlike other ioscheds, as-iosched handles alias by chaing them using
rq->queuelist.  As aliased requests are very rare in the first place,
this complicates merge/dispatch handling without meaningful
performance improvement.  This patch updates as-iosched to dump
aliased requests into dispatch queue as other ioscheds do.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/as-iosched.c | 144 ++++++++++-------------------------------------------
 1 file changed, 25 insertions(+), 119 deletions(-)

diff --git a/block/as-iosched.c b/block/as-iosched.c
index 43fa2049568..8da3cf66894 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -182,6 +182,9 @@ struct as_rq {
 
 static kmem_cache_t *arq_pool;
 
+static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq);
+static void as_antic_stop(struct as_data *ad);
+
 /*
  * IO Context helper functions
  */
@@ -370,7 +373,7 @@ static struct as_rq *as_find_first_arq(struct as_data *ad, int data_dir)
  * existing request against the same sector), which can happen when using
  * direct IO, then return the alias.
  */
-static struct as_rq *as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
+static struct as_rq *__as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
 {
 	struct rb_node **p = &ARQ_RB_ROOT(ad, arq)->rb_node;
 	struct rb_node *parent = NULL;
@@ -397,6 +400,16 @@ static struct as_rq *as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
 	return NULL;
 }
 
+static void as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
+{
+	struct as_rq *alias;
+
+	while ((unlikely(alias = __as_add_arq_rb(ad, arq)))) {
+		as_move_to_dispatch(ad, alias);
+		as_antic_stop(ad);
+	}
+}
+
 static inline void as_del_arq_rb(struct as_data *ad, struct as_rq *arq)
 {
 	if (!ON_RB(&arq->rb_node)) {
@@ -1133,23 +1146,6 @@ static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq)
 	/*
 	 * take it off the sort and fifo list, add to dispatch queue
 	 */
-	while (!list_empty(&rq->queuelist)) {
-		struct request *__rq = list_entry_rq(rq->queuelist.next);
-		struct as_rq *__arq = RQ_DATA(__rq);
-
-		list_del(&__rq->queuelist);
-
-		elv_dispatch_add_tail(ad->q, __rq);
-
-		if (__arq->io_context && __arq->io_context->aic)
-			atomic_inc(&__arq->io_context->aic->nr_dispatched);
-
-		WARN_ON(__arq->state != AS_RQ_QUEUED);
-		__arq->state = AS_RQ_DISPATCHED;
-
-		ad->nr_dispatched++;
-	}
-
 	as_remove_queued_request(ad->q, rq);
 	WARN_ON(arq->state != AS_RQ_QUEUED);
 
@@ -1325,42 +1321,6 @@ fifo_expired:
 	return 1;
 }
 
-/*
- * Add arq to a list behind alias
- */
-static inline void
-as_add_aliased_request(struct as_data *ad, struct as_rq *arq,
-				struct as_rq *alias)
-{
-	struct request  *req = arq->request;
-	struct list_head *insert = alias->request->queuelist.prev;
-
-	/*
-	 * Transfer list of aliases
-	 */
-	while (!list_empty(&req->queuelist)) {
-		struct request *__rq = list_entry_rq(req->queuelist.next);
-		struct as_rq *__arq = RQ_DATA(__rq);
-
-		list_move_tail(&__rq->queuelist, &alias->request->queuelist);
-
-		WARN_ON(__arq->state != AS_RQ_QUEUED);
-	}
-
-	/*
-	 * Another request with the same start sector on the rbtree.
-	 * Link this request to that sector. They are untangled in
-	 * as_move_to_dispatch
-	 */
-	list_add(&arq->request->queuelist, insert);
-
-	/*
-	 * Don't want to have to handle merges.
-	 */
-	as_del_arq_hash(arq);
-	arq->request->flags |= REQ_NOMERGE;
-}
-
 /*
  * add arq to rbtree and fifo
  */
@@ -1368,7 +1328,6 @@ static void as_add_request(request_queue_t *q, struct request *rq)
 {
 	struct as_data *ad = q->elevator->elevator_data;
 	struct as_rq *arq = RQ_DATA(rq);
-	struct as_rq *alias;
 	int data_dir;
 
 	arq->state = AS_RQ_NEW;
@@ -1387,33 +1346,17 @@ static void as_add_request(request_queue_t *q, struct request *rq)
 		atomic_inc(&arq->io_context->aic->nr_queued);
 	}
 
-	alias = as_add_arq_rb(ad, arq);
-	if (!alias) {
-		/*
-		 * set expire time (only used for reads) and add to fifo list
-		 */
-		arq->expires = jiffies + ad->fifo_expire[data_dir];
-		list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]);
+	as_add_arq_rb(ad, arq);
+	if (rq_mergeable(arq->request))
+		as_add_arq_hash(ad, arq);
 
-		if (rq_mergeable(arq->request))
-			as_add_arq_hash(ad, arq);
-		as_update_arq(ad, arq); /* keep state machine up to date */
-
-	} else {
-		as_add_aliased_request(ad, arq, alias);
-
-		/*
-		 * have we been anticipating this request?
-		 * or does it come from the same process as the one we are
-		 * anticipating for?
-		 */
-		if (ad->antic_status == ANTIC_WAIT_REQ
-				|| ad->antic_status == ANTIC_WAIT_NEXT) {
-			if (as_can_break_anticipation(ad, arq))
-				as_antic_stop(ad);
-		}
-	}
+	/*
+	 * set expire time (only used for reads) and add to fifo list
+	 */
+	arq->expires = jiffies + ad->fifo_expire[data_dir];
+	list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]);
 
+	as_update_arq(ad, arq); /* keep state machine up to date */
 	arq->state = AS_RQ_QUEUED;
 }
 
@@ -1536,23 +1479,8 @@ static void as_merged_request(request_queue_t *q, struct request *req)
 	 * if the merge was a front merge, we need to reposition request
 	 */
 	if (rq_rb_key(req) != arq->rb_key) {
-		struct as_rq *alias, *next_arq = NULL;
-
-		if (ad->next_arq[arq->is_sync] == arq)
-			next_arq = as_find_next_arq(ad, arq);
-
-		/*
-		 * Note! We should really be moving any old aliased requests
-		 * off this request and try to insert them into the rbtree. We
-		 * currently don't bother. Ditto the next function.
-		 */
 		as_del_arq_rb(ad, arq);
-		if ((alias = as_add_arq_rb(ad, arq))) {
-			list_del_init(&arq->fifo);
-			as_add_aliased_request(ad, arq, alias);
-			if (next_arq)
-				ad->next_arq[arq->is_sync] = next_arq;
-		}
+		as_add_arq_rb(ad, arq);
 		/*
 		 * Note! At this stage of this and the next function, our next
 		 * request may not be optimal - eg the request may have "grown"
@@ -1579,18 +1507,8 @@ static void as_merged_requests(request_queue_t *q, struct request *req,
 	as_add_arq_hash(ad, arq);
 
 	if (rq_rb_key(req) != arq->rb_key) {
-		struct as_rq *alias, *next_arq = NULL;
-
-		if (ad->next_arq[arq->is_sync] == arq)
-			next_arq = as_find_next_arq(ad, arq);
-
 		as_del_arq_rb(ad, arq);
-		if ((alias = as_add_arq_rb(ad, arq))) {
-			list_del_init(&arq->fifo);
-			as_add_aliased_request(ad, arq, alias);
-			if (next_arq)
-				ad->next_arq[arq->is_sync] = next_arq;
-		}
+		as_add_arq_rb(ad, arq);
 	}
 
 	/*
@@ -1609,18 +1527,6 @@ static void as_merged_requests(request_queue_t *q, struct request *req,
 		}
 	}
 
-	/*
-	 * Transfer list of aliases
-	 */
-	while (!list_empty(&next->queuelist)) {
-		struct request *__rq = list_entry_rq(next->queuelist.next);
-		struct as_rq *__arq = RQ_DATA(__rq);
-
-		list_move_tail(&__rq->queuelist, &req->queuelist);
-
-		WARN_ON(__arq->state != AS_RQ_QUEUED);
-	}
-
 	/*
 	 * kill knowledge of next, this one is a goner
 	 */
-- 
cgit v1.2.3


From 88ee5ef157202624de2b43b3512fdcb54fda1ab5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Sat, 12 Nov 2005 11:09:12 +0100
Subject: [BLOCK] ll_rw_blk: fastpath get_request()

Originally from: Nick Piggin <nickpiggin@yahoo.com.au>

Move current_io_context out of the get_request fastpth.  Also try to
streamline a few other things in this area.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/ll_rw_blk.c | 70 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 37 insertions(+), 33 deletions(-)

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index d4beb9a89ee..97f4e7ecedf 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -1908,40 +1908,40 @@ static struct request *get_request(request_queue_t *q, int rw, struct bio *bio,
 {
 	struct request *rq = NULL;
 	struct request_list *rl = &q->rq;
-	struct io_context *ioc = current_io_context(GFP_ATOMIC);
-	int priv;
+	struct io_context *ioc = NULL;
+	int may_queue, priv;
 
-	if (rl->count[rw]+1 >= q->nr_requests) {
-		/*
-		 * The queue will fill after this allocation, so set it as
-		 * full, and mark this process as "batching". This process
-		 * will be allowed to complete a batch of requests, others
-		 * will be blocked.
-		 */
-		if (!blk_queue_full(q, rw)) {
-			ioc_set_batching(q, ioc);
-			blk_set_queue_full(q, rw);
-		}
-	}
+	may_queue = elv_may_queue(q, rw, bio);
+	if (may_queue == ELV_MQUEUE_NO)
+		goto rq_starved;
 
-	switch (elv_may_queue(q, rw, bio)) {
-		case ELV_MQUEUE_NO:
-			goto rq_starved;
-		case ELV_MQUEUE_MAY:
-			break;
-		case ELV_MQUEUE_MUST:
-			goto get_rq;
-	}
-
-	if (blk_queue_full(q, rw) && !ioc_batching(q, ioc)) {
-		/*
-		 * The queue is full and the allocating process is not a
-		 * "batcher", and not exempted by the IO scheduler
-		 */
-		goto out;
+	if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) {
+		if (rl->count[rw]+1 >= q->nr_requests) {
+			ioc = current_io_context(GFP_ATOMIC);
+			/*
+			 * The queue will fill after this allocation, so set
+			 * it as full, and mark this process as "batching".
+			 * This process will be allowed to complete a batch of
+			 * requests, others will be blocked.
+			 */
+			if (!blk_queue_full(q, rw)) {
+				ioc_set_batching(q, ioc);
+				blk_set_queue_full(q, rw);
+			} else {
+				if (may_queue != ELV_MQUEUE_MUST
+						&& !ioc_batching(q, ioc)) {
+					/*
+					 * The queue is full and the allocating
+					 * process is not a "batcher", and not
+					 * exempted by the IO scheduler
+					 */
+					goto out;
+				}
+			}
+		}
+		set_queue_congested(q, rw);
 	}
 
-get_rq:
 	/*
 	 * Only allow batching queuers to allocate up to 50% over the defined
 	 * limit of requests, otherwise we could have thousands of requests
@@ -1952,8 +1952,6 @@ get_rq:
 
 	rl->count[rw]++;
 	rl->starved[rw] = 0;
-	if (rl->count[rw] >= queue_congestion_on_threshold(q))
-		set_queue_congested(q, rw);
 
 	priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
 	if (priv)
@@ -1962,7 +1960,7 @@ get_rq:
 	spin_unlock_irq(q->queue_lock);
 
 	rq = blk_alloc_request(q, rw, bio, priv, gfp_mask);
-	if (!rq) {
+	if (unlikely(!rq)) {
 		/*
 		 * Allocation failed presumably due to memory. Undo anything
 		 * we might have messed up.
@@ -1987,6 +1985,12 @@ rq_starved:
 		goto out;
 	}
 
+	/*
+	 * ioc may be NULL here, and ioc_batching will be false. That's
+	 * OK, if the queue is under the request limit then requests need
+	 * not count toward the nr_batch_requests limit. There will always
+	 * be some limit enforced by BLK_BATCH_TIME.
+	 */
 	if (ioc_batching(q, ioc))
 		ioc->nr_batch_requests--;
 	
-- 
cgit v1.2.3


From 80cfd548eed68cf90c5ae9cfcd6b02230cece756 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Fri, 6 Jan 2006 09:43:28 +0100
Subject: [BLOCK] bio: check for same page merge possibilities in
 __bio_add_page()

For filesystems with a blocksize < page size, we can merge same page
calls into the bio_vec at the end of the bio. This saves segments
on systems with a page size > the "normal" 4kb fs block size.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/bio.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/fs/bio.c b/fs/bio.c
index 38d3e8023a0..dfe242a21eb 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -325,10 +325,31 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page
 	if (unlikely(bio_flagged(bio, BIO_CLONED)))
 		return 0;
 
-	if (bio->bi_vcnt >= bio->bi_max_vecs)
+	if (((bio->bi_size + len) >> 9) > max_sectors)
 		return 0;
 
-	if (((bio->bi_size + len) >> 9) > max_sectors)
+	/*
+	 * For filesystems with a blocksize smaller than the pagesize
+	 * we will often be called with the same page as last time and
+	 * a consecutive offset.  Optimize this special case.
+	 */
+	if (bio->bi_vcnt > 0) {
+		struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+		if (page == prev->bv_page &&
+		    offset == prev->bv_offset + prev->bv_len) {
+			prev->bv_len += len;
+			if (q->merge_bvec_fn &&
+			    q->merge_bvec_fn(q, bio, prev) < len) {
+				prev->bv_len -= len;
+				return 0;
+			}
+
+			goto done;
+		}
+	}
+
+	if (bio->bi_vcnt >= bio->bi_max_vecs)
 		return 0;
 
 	/*
@@ -382,6 +403,7 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page
 	bio->bi_vcnt++;
 	bio->bi_phys_segments++;
 	bio->bi_hw_segments++;
+ done:
 	bio->bi_size += len;
 	return len;
 }
-- 
cgit v1.2.3


From 64100099ed22f71cce656c5c2caecf5c9cf255dc Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Fri, 6 Jan 2006 09:46:02 +0100
Subject: [BLOCK] mark some block/ variables cons

the patch below marks various read-only variables in block/* as const,
so that gcc can optimize the use of them; eg gcc will replace the use by
the value directly now and will even remove the memory usage of these.

Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/cfq-iosched.c      | 16 ++++++++--------
 block/deadline-iosched.c |  8 ++++----
 block/ll_rw_blk.c        |  2 +-
 block/scsi_ioctl.c       |  2 +-
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ee0bb41694b..74fae2daf87 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -25,15 +25,15 @@
 /*
  * tunables
  */
-static int cfq_quantum = 4;		/* max queue in one round of service */
-static int cfq_queued = 8;		/* minimum rq allocate limit per-queue*/
-static int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
-static int cfq_back_max = 16 * 1024;	/* maximum backwards seek, in KiB */
-static int cfq_back_penalty = 2;	/* penalty of a backwards seek */
+static const int cfq_quantum = 4;		/* max queue in one round of service */
+static const int cfq_queued = 8;		/* minimum rq allocate limit per-queue*/
+static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
+static const int cfq_back_max = 16 * 1024;	/* maximum backwards seek, in KiB */
+static const int cfq_back_penalty = 2;		/* penalty of a backwards seek */
 
-static int cfq_slice_sync = HZ / 10;
+static const int cfq_slice_sync = HZ / 10;
 static int cfq_slice_async = HZ / 25;
-static int cfq_slice_async_rq = 2;
+static const int cfq_slice_async_rq = 2;
 static int cfq_slice_idle = HZ / 100;
 
 #define CFQ_IDLE_GRACE		(HZ / 10)
@@ -45,7 +45,7 @@ static int cfq_slice_idle = HZ / 100;
 /*
  * disable queueing at the driver/hardware level
  */
-static int cfq_max_depth = 2;
+static const int cfq_max_depth = 2;
 
 /*
  * for the hash of cfqq inside the cfqd
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 9cbec09e841..27e494b1bf9 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -19,10 +19,10 @@
 /*
  * See Documentation/block/deadline-iosched.txt
  */
-static int read_expire = HZ / 2;  /* max time before a read is submitted. */
-static int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
-static int writes_starved = 2;    /* max times reads can starve a write */
-static int fifo_batch = 16;       /* # of sequential requests treated as one
+static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
+static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
+static const int writes_starved = 2;    /* max times reads can starve a write */
+static const int fifo_batch = 16;       /* # of sequential requests treated as one
 				     by the above parameters. For throughput. */
 
 static const int deadline_hash_shift = 5;
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 97f4e7ecedf..e02c88ca8fb 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -1039,7 +1039,7 @@ void blk_queue_invalidate_tags(request_queue_t *q)
 
 EXPORT_SYMBOL(blk_queue_invalidate_tags);
 
-static char *rq_flags[] = {
+static const char * const rq_flags[] = {
 	"REQ_RW",
 	"REQ_FAILFAST",
 	"REQ_SORTED",
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 1d8852f7bbf..c2ac36dfe4f 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -46,7 +46,7 @@ EXPORT_SYMBOL(scsi_command_size);
 
 static int sg_get_version(int __user *p)
 {
-	static int sg_version_num = 30527;
+	static const int sg_version_num = 30527;
 	return put_user(sg_version_num, p);
 }
 
-- 
cgit v1.2.3


From 8ffdc6550c47f75ca4e6c9f30a2a89063e035cf2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:49:03 +0100
Subject: [BLOCK] add @uptodate to end_that_request_last() and @error to
 rq_end_io_fn()

add @uptodate argument to end_that_request_last() and @error
to rq_end_io_fn().  there's no generic way to pass error code
to request completion function, making generic error handling
of non-fs request difficult (rq->errors is driver-specific and
each driver uses it differently).  this patch adds @uptodate
to end_that_request_last() and @error to rq_end_io_fn().

for fs requests, this doesn't really matter, so just using the
same uptodate argument used in the last call to
end_that_request_first() should suffice.  imho, this can also
help the generic command-carrying request jens is working on.

Signed-off-by: tejun heo <htejun@gmail.com>
Signed-Off-By: Jens Axboe <axboe@suse.de>
---
 block/elevator.c                |  2 +-
 block/ll_rw_blk.c               | 22 +++++++++++++++-------
 drivers/block/DAC960.c          |  2 +-
 drivers/block/cciss.c           |  2 +-
 drivers/block/cpqarray.c        |  2 +-
 drivers/block/floppy.c          |  2 +-
 drivers/block/nbd.c             |  2 +-
 drivers/block/sx8.c             |  2 +-
 drivers/block/ub.c              |  2 +-
 drivers/block/viodasd.c         |  2 +-
 drivers/cdrom/cdu31a.c          |  2 +-
 drivers/ide/ide-cd.c            |  4 ++--
 drivers/ide/ide-io.c            |  6 +++---
 drivers/message/i2o/i2o_block.c |  2 +-
 drivers/mmc/mmc_block.c         |  4 ++--
 drivers/s390/block/dasd.c       |  2 +-
 drivers/s390/char/tape_block.c  |  2 +-
 drivers/scsi/ide-scsi.c         |  4 ++--
 drivers/scsi/scsi_lib.c         |  2 +-
 drivers/scsi/sd.c               |  2 +-
 include/linux/blkdev.h          |  6 +++---
 21 files changed, 42 insertions(+), 34 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index 6c3fc8a10bf..85a11cee7d1 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -498,7 +498,7 @@ struct request *elv_next_request(request_queue_t *q)
 			blkdev_dequeue_request(rq);
 			rq->flags |= REQ_QUIET;
 			end_that_request_chunk(rq, 0, nr_bytes);
-			end_that_request_last(rq);
+			end_that_request_last(rq, 0);
 		} else {
 			printk(KERN_ERR "%s: bad return=%d\n", __FUNCTION__,
 								ret);
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index e02c88ca8fb..8b1ae69bc5a 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -344,7 +344,7 @@ EXPORT_SYMBOL(blk_queue_issue_flush_fn);
 /*
  * Cache flushing for ordered writes handling
  */
-static void blk_pre_flush_end_io(struct request *flush_rq)
+static void blk_pre_flush_end_io(struct request *flush_rq, int error)
 {
 	struct request *rq = flush_rq->end_io_data;
 	request_queue_t *q = rq->q;
@@ -362,7 +362,7 @@ static void blk_pre_flush_end_io(struct request *flush_rq)
 	}
 }
 
-static void blk_post_flush_end_io(struct request *flush_rq)
+static void blk_post_flush_end_io(struct request *flush_rq, int error)
 {
 	struct request *rq = flush_rq->end_io_data;
 	request_queue_t *q = rq->q;
@@ -2317,7 +2317,7 @@ EXPORT_SYMBOL(blk_rq_map_kern);
  */
 void blk_execute_rq_nowait(request_queue_t *q, struct gendisk *bd_disk,
 			   struct request *rq, int at_head,
-			   void (*done)(struct request *))
+			   rq_end_io_fn *done)
 {
 	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
 
@@ -2521,7 +2521,7 @@ EXPORT_SYMBOL(blk_put_request);
  * blk_end_sync_rq - executes a completion event on a request
  * @rq: request to complete
  */
-void blk_end_sync_rq(struct request *rq)
+void blk_end_sync_rq(struct request *rq, int error)
 {
 	struct completion *waiting = rq->waiting;
 
@@ -3183,9 +3183,17 @@ EXPORT_SYMBOL(end_that_request_chunk);
 /*
  * queue lock must be held
  */
-void end_that_request_last(struct request *req)
+void end_that_request_last(struct request *req, int uptodate)
 {
 	struct gendisk *disk = req->rq_disk;
+	int error;
+
+	/*
+	 * extend uptodate bool to allow < 0 value to be direct io error
+	 */
+	error = 0;
+	if (end_io_error(uptodate))
+		error = !uptodate ? -EIO : uptodate;
 
 	if (unlikely(laptop_mode) && blk_fs_request(req))
 		laptop_io_completion();
@@ -3200,7 +3208,7 @@ void end_that_request_last(struct request *req)
 		disk->in_flight--;
 	}
 	if (req->end_io)
-		req->end_io(req);
+		req->end_io(req, error);
 	else
 		__blk_put_request(req->q, req);
 }
@@ -3212,7 +3220,7 @@ void end_request(struct request *req, int uptodate)
 	if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) {
 		add_disk_randomness(req->rq_disk);
 		blkdev_dequeue_request(req);
-		end_that_request_last(req);
+		end_that_request_last(req, uptodate);
 	}
 }
 
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 70eaa5c7ac0..21097a39a05 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -3471,7 +3471,7 @@ static inline boolean DAC960_ProcessCompletedRequest(DAC960_Command_T *Command,
 
 	 if (!end_that_request_first(Request, UpToDate, Command->BlockCount)) {
 
- 	 	end_that_request_last(Request);
+ 	 	end_that_request_last(Request, UpToDate);
 
 		if (Command->Completion) {
 			complete(Command->Completion);
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index c3441b3f086..d2815b7a915 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2310,7 +2310,7 @@ static inline void complete_command( ctlr_info_t *h, CommandList_struct *cmd,
 	printk("Done with %p\n", cmd->rq);
 #endif /* CCISS_DEBUG */ 
 
-	end_that_request_last(cmd->rq);
+	end_that_request_last(cmd->rq, status ? 1 : -EIO);
 	cmd_free(h,cmd,1);
 }
 
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index cf1822a6361..9bddb687487 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -1036,7 +1036,7 @@ static inline void complete_command(cmdlist_t *cmd, int timeout)
 	complete_buffers(cmd->rq->bio, ok);
 
         DBGPX(printk("Done with %p\n", cmd->rq););
-	end_that_request_last(cmd->rq);
+	end_that_request_last(cmd->rq, ok ? 1 : -EIO);
 }
 
 /*
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index f7e765a1d31..a5b857c5c4b 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2301,7 +2301,7 @@ static void floppy_end_request(struct request *req, int uptodate)
 	add_disk_randomness(req->rq_disk);
 	floppy_off((long)req->rq_disk->private_data);
 	blkdev_dequeue_request(req);
-	end_that_request_last(req);
+	end_that_request_last(req, uptodate);
 
 	/* We're done with the request */
 	current_req = NULL;
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 9e268ddedfb..485345c8e63 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -136,7 +136,7 @@ static void nbd_end_request(struct request *req)
 
 	spin_lock_irqsave(q->queue_lock, flags);
 	if (!end_that_request_first(req, uptodate, req->nr_sectors)) {
-		end_that_request_last(req);
+		end_that_request_last(req, uptodate);
 	}
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 1ded3b43345..9251f4131b5 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -770,7 +770,7 @@ static inline void carm_end_request_queued(struct carm_host *host,
 	rc = end_that_request_first(req, uptodate, req->hard_nr_sectors);
 	assert(rc == 0);
 
-	end_that_request_last(req);
+	end_that_request_last(req, uptodate);
 
 	rc = carm_put_request(host, crq);
 	assert(rc == 0);
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 10740a06508..a05fe5843e6 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -951,7 +951,7 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
 static void ub_end_rq(struct request *rq, int uptodate)
 {
 	end_that_request_first(rq, uptodate, rq->hard_nr_sectors);
-	end_that_request_last(rq);
+	end_that_request_last(rq, uptodate);
 }
 
 static int ub_rw_cmd_retry(struct ub_dev *sc, struct ub_lun *lun,
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index 2d518aa2720..063f0304a16 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -305,7 +305,7 @@ static void viodasd_end_request(struct request *req, int uptodate,
 	if (end_that_request_first(req, uptodate, num_sectors))
 		return;
 	add_disk_randomness(req->rq_disk);
-	end_that_request_last(req);
+	end_that_request_last(req, uptodate);
 }
 
 /*
diff --git a/drivers/cdrom/cdu31a.c b/drivers/cdrom/cdu31a.c
index ac96de15d83..378e88d2075 100644
--- a/drivers/cdrom/cdu31a.c
+++ b/drivers/cdrom/cdu31a.c
@@ -1402,7 +1402,7 @@ static void do_cdu31a_request(request_queue_t * q)
 			if (!end_that_request_first(req, 1, nblock)) {
 				spin_lock_irq(q->queue_lock);
 				blkdev_dequeue_request(req);
-				end_that_request_last(req);
+				end_that_request_last(req, 1);
 				spin_unlock_irq(q->queue_lock);
 			}
 			continue;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 70aeb3a6012..d31117eb95a 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -614,7 +614,7 @@ static void cdrom_end_request (ide_drive_t *drive, int uptodate)
 			 */
 			spin_lock_irqsave(&ide_lock, flags);
 			end_that_request_chunk(failed, 0, failed->data_len);
-			end_that_request_last(failed);
+			end_that_request_last(failed, 0);
 			spin_unlock_irqrestore(&ide_lock, flags);
 		}
 
@@ -1735,7 +1735,7 @@ end_request:
 
 	spin_lock_irqsave(&ide_lock, flags);
 	blkdev_dequeue_request(rq);
-	end_that_request_last(rq);
+	end_that_request_last(rq, 1);
 	HWGROUP(drive)->rq = NULL;
 	spin_unlock_irqrestore(&ide_lock, flags);
 	return ide_stopped;
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index ecfafcdafea..8435b44a700 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -89,7 +89,7 @@ int __ide_end_request(ide_drive_t *drive, struct request *rq, int uptodate,
 
 		blkdev_dequeue_request(rq);
 		HWGROUP(drive)->rq = NULL;
-		end_that_request_last(rq);
+		end_that_request_last(rq, uptodate);
 		ret = 0;
 	}
 	return ret;
@@ -247,7 +247,7 @@ static void ide_complete_pm_request (ide_drive_t *drive, struct request *rq)
 	}
 	blkdev_dequeue_request(rq);
 	HWGROUP(drive)->rq = NULL;
-	end_that_request_last(rq);
+	end_that_request_last(rq, 1);
 	spin_unlock_irqrestore(&ide_lock, flags);
 }
 
@@ -379,7 +379,7 @@ void ide_end_drive_cmd (ide_drive_t *drive, u8 stat, u8 err)
 	blkdev_dequeue_request(rq);
 	HWGROUP(drive)->rq = NULL;
 	rq->errors = err;
-	end_that_request_last(rq);
+	end_that_request_last(rq, !rq->errors);
 	spin_unlock_irqrestore(&ide_lock, flags);
 }
 
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index f283b5bafdd..4f522527b7e 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -466,7 +466,7 @@ static void i2o_block_end_request(struct request *req, int uptodate,
 
 	spin_lock_irqsave(q->queue_lock, flags);
 
-	end_that_request_last(req);
+	end_that_request_last(req, uptodate);
 
 	if (likely(dev)) {
 		dev->open_queue_depth--;
diff --git a/drivers/mmc/mmc_block.c b/drivers/mmc/mmc_block.c
index abcf19116d7..8e380c14bf6 100644
--- a/drivers/mmc/mmc_block.c
+++ b/drivers/mmc/mmc_block.c
@@ -263,7 +263,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
 			 */
 			add_disk_randomness(req->rq_disk);
 			blkdev_dequeue_request(req);
-			end_that_request_last(req);
+			end_that_request_last(req, 1);
 		}
 		spin_unlock_irq(&md->lock);
 	} while (ret);
@@ -289,7 +289,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
 
 	add_disk_randomness(req->rq_disk);
 	blkdev_dequeue_request(req);
-	end_that_request_last(req);
+	end_that_request_last(req, 0);
 	spin_unlock_irq(&md->lock);
 
 	return 0;
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 7008d32433b..fdb61380c52 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -1035,7 +1035,7 @@ dasd_end_request(struct request *req, int uptodate)
 	if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
 		BUG();
 	add_disk_randomness(req->rq_disk);
-	end_that_request_last(req);
+	end_that_request_last(req, uptodate);
 }
 
 /*
diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index 1efc9f21229..559d51490e2 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -78,7 +78,7 @@ tapeblock_end_request(struct request *req, int uptodate)
 {
 	if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
 		BUG();
-	end_that_request_last(req);
+	end_that_request_last(req, uptodate);
 }
 
 static void
diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c
index 4cb1f3ed910..3c688ef5466 100644
--- a/drivers/scsi/ide-scsi.c
+++ b/drivers/scsi/ide-scsi.c
@@ -1046,7 +1046,7 @@ static int idescsi_eh_reset (struct scsi_cmnd *cmd)
 
 	/* kill current request */
 	blkdev_dequeue_request(req);
-	end_that_request_last(req);
+	end_that_request_last(req, 0);
 	if (req->flags & REQ_SENSE)
 		kfree(scsi->pc->buffer);
 	kfree(scsi->pc);
@@ -1056,7 +1056,7 @@ static int idescsi_eh_reset (struct scsi_cmnd *cmd)
 	/* now nuke the drive queue */
 	while ((req = elv_next_request(drive->queue))) {
 		blkdev_dequeue_request(req);
-		end_that_request_last(req);
+		end_that_request_last(req, 0);
 	}
 
 	HWGROUP(drive)->rq = NULL;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index a7f3f0c84db..53551f1dfe2 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -791,7 +791,7 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int uptodate,
 	spin_lock_irqsave(q->queue_lock, flags);
 	if (blk_rq_tagged(req))
 		blk_queue_end_tag(q, req);
-	end_that_request_last(req);
+	end_that_request_last(req, uptodate);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
 	/*
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 3d3ad7d1b77..d651150ee76 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -748,7 +748,7 @@ static void sd_end_flush(request_queue_t *q, struct request *flush_rq)
 		 * force journal abort of barriers
 		 */
 		end_that_request_first(rq, -EOPNOTSUPP, rq->hard_nr_sectors);
-		end_that_request_last(rq);
+		end_that_request_last(rq, -EOPNOTSUPP);
 	}
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a18500d196e..a0ce8c58516 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -102,7 +102,7 @@ void copy_io_context(struct io_context **pdst, struct io_context **psrc);
 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2);
 
 struct request;
-typedef void (rq_end_io_fn)(struct request *);
+typedef void (rq_end_io_fn)(struct request *, int);
 
 struct request_list {
 	int count[2];
@@ -560,7 +560,7 @@ extern void register_disk(struct gendisk *dev);
 extern void generic_make_request(struct bio *bio);
 extern void blk_put_request(struct request *);
 extern void __blk_put_request(request_queue_t *, struct request *);
-extern void blk_end_sync_rq(struct request *rq);
+extern void blk_end_sync_rq(struct request *rq, int error);
 extern void blk_attempt_remerge(request_queue_t *, struct request *);
 extern struct request *blk_get_request(request_queue_t *, int, gfp_t);
 extern void blk_insert_request(request_queue_t *, struct request *, int, void *);
@@ -614,7 +614,7 @@ static inline void blk_run_address_space(struct address_space *mapping)
  */
 extern int end_that_request_first(struct request *, int, int);
 extern int end_that_request_chunk(struct request *, int, int);
-extern void end_that_request_last(struct request *);
+extern void end_that_request_last(struct request *, int);
 extern void end_request(struct request *req, int uptodate);
 
 /*
-- 
cgit v1.2.3


From 52d9e675361261a1eb1716b02222ec6177ec342b Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:49:58 +0100
Subject: [BLOCK] ll_rw_blk: separate out bio init part from __make_request

Separate out bio initialization part from __make_request.  It
will be used by the following blk_ordered_reimpl.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/ll_rw_blk.c | 62 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 33 insertions(+), 29 deletions(-)

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 8b1ae69bc5a..65c4efc02ad 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -36,6 +36,8 @@
 static void blk_unplug_work(void *data);
 static void blk_unplug_timeout(unsigned long data);
 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io);
+static void init_request_from_bio(struct request *req, struct bio *bio);
+static int __make_request(request_queue_t *q, struct bio *bio);
 
 /*
  * For the allocated request tables
@@ -1667,8 +1669,6 @@ static int blk_init_free_list(request_queue_t *q)
 	return 0;
 }
 
-static int __make_request(request_queue_t *, struct bio *);
-
 request_queue_t *blk_alloc_queue(gfp_t gfp_mask)
 {
 	return blk_alloc_queue_node(gfp_mask, -1);
@@ -2659,6 +2659,36 @@ void blk_attempt_remerge(request_queue_t *q, struct request *rq)
 
 EXPORT_SYMBOL(blk_attempt_remerge);
 
+static void init_request_from_bio(struct request *req, struct bio *bio)
+{
+	req->flags |= REQ_CMD;
+
+	/*
+	 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
+	 */
+	if (bio_rw_ahead(bio) || bio_failfast(bio))
+		req->flags |= REQ_FAILFAST;
+
+	/*
+	 * REQ_BARRIER implies no merging, but lets make it explicit
+	 */
+	if (unlikely(bio_barrier(bio)))
+		req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
+
+	req->errors = 0;
+	req->hard_sector = req->sector = bio->bi_sector;
+	req->hard_nr_sectors = req->nr_sectors = bio_sectors(bio);
+	req->current_nr_sectors = req->hard_cur_sectors = bio_cur_sectors(bio);
+	req->nr_phys_segments = bio_phys_segments(req->q, bio);
+	req->nr_hw_segments = bio_hw_segments(req->q, bio);
+	req->buffer = bio_data(bio);	/* see ->buffer comment above */
+	req->waiting = NULL;
+	req->bio = req->biotail = bio;
+	req->ioprio = bio_prio(bio);
+	req->rq_disk = bio->bi_bdev->bd_disk;
+	req->start_time = jiffies;
+}
+
 static int __make_request(request_queue_t *q, struct bio *bio)
 {
 	struct request *req;
@@ -2754,33 +2784,7 @@ get_rq:
 	 * We don't worry about that case for efficiency. It won't happen
 	 * often, and the elevators are able to handle it.
 	 */
-
-	req->flags |= REQ_CMD;
-
-	/*
-	 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
-	 */
-	if (bio_rw_ahead(bio) || bio_failfast(bio))
-		req->flags |= REQ_FAILFAST;
-
-	/*
-	 * REQ_BARRIER implies no merging, but lets make it explicit
-	 */
-	if (unlikely(barrier))
-		req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
-
-	req->errors = 0;
-	req->hard_sector = req->sector = sector;
-	req->hard_nr_sectors = req->nr_sectors = nr_sectors;
-	req->current_nr_sectors = req->hard_cur_sectors = cur_nr_sectors;
-	req->nr_phys_segments = bio_phys_segments(q, bio);
-	req->nr_hw_segments = bio_hw_segments(q, bio);
-	req->buffer = bio_data(bio);	/* see ->buffer comment above */
-	req->waiting = NULL;
-	req->bio = req->biotail = bio;
-	req->ioprio = prio;
-	req->rq_disk = bio->bi_bdev->bd_disk;
-	req->start_time = jiffies;
+	init_request_from_bio(req, bio);
 
 	spin_lock_irq(q->queue_lock);
 	if (elv_queue_empty(q))
-- 
cgit v1.2.3


From 797e7dbbee0a91fa1349192f18ad5c454997d876 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:51:03 +0100
Subject: [BLOCK] reimplement handling of barrier request

Reimplement handling of barrier requests.

* Flexible handling to deal with various capabilities of
  target devices.
* Retry support for falling back.
* Tagged queues which don't support ordered tag can do ordered.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/elevator.c         |  84 +++++++----
 block/ll_rw_blk.c        | 384 ++++++++++++++++++++++++++++++-----------------
 include/linux/blkdev.h   |  82 +++++++---
 include/linux/elevator.h |   1 +
 4 files changed, 359 insertions(+), 192 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index 85a11cee7d1..39dcccc82ad 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -304,15 +304,7 @@ void elv_requeue_request(request_queue_t *q, struct request *rq)
 
 	rq->flags &= ~REQ_STARTED;
 
-	/*
-	 * if this is the flush, requeue the original instead and drop the flush
-	 */
-	if (rq->flags & REQ_BAR_FLUSH) {
-		clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
-		rq = rq->end_io_data;
-	}
-
-	__elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0);
+	__elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE, 0);
 }
 
 static void elv_drain_elevator(request_queue_t *q)
@@ -332,7 +324,18 @@ static void elv_drain_elevator(request_queue_t *q)
 void __elv_add_request(request_queue_t *q, struct request *rq, int where,
 		       int plug)
 {
+	struct list_head *pos;
+	unsigned ordseq;
+
+	if (q->ordcolor)
+		rq->flags |= REQ_ORDERED_COLOR;
+
 	if (rq->flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
+		/*
+		 * toggle ordered color
+		 */
+		q->ordcolor ^= 1;
+
 		/*
 		 * barriers implicitly indicate back insertion
 		 */
@@ -393,6 +396,30 @@ void __elv_add_request(request_queue_t *q, struct request *rq, int where,
 		q->elevator->ops->elevator_add_req_fn(q, rq);
 		break;
 
+	case ELEVATOR_INSERT_REQUEUE:
+		/*
+		 * If ordered flush isn't in progress, we do front
+		 * insertion; otherwise, requests should be requeued
+		 * in ordseq order.
+		 */
+		rq->flags |= REQ_SOFTBARRIER;
+
+		if (q->ordseq == 0) {
+			list_add(&rq->queuelist, &q->queue_head);
+			break;
+		}
+
+		ordseq = blk_ordered_req_seq(rq);
+
+		list_for_each(pos, &q->queue_head) {
+			struct request *pos_rq = list_entry_rq(pos);
+			if (ordseq <= blk_ordered_req_seq(pos_rq))
+				break;
+		}
+
+		list_add_tail(&rq->queuelist, pos);
+		break;
+
 	default:
 		printk(KERN_ERR "%s: bad insertion point %d\n",
 		       __FUNCTION__, where);
@@ -422,25 +449,16 @@ static inline struct request *__elv_next_request(request_queue_t *q)
 {
 	struct request *rq;
 
-	if (unlikely(list_empty(&q->queue_head) &&
-		     !q->elevator->ops->elevator_dispatch_fn(q, 0)))
-		return NULL;
-
-	rq = list_entry_rq(q->queue_head.next);
-
-	/*
-	 * if this is a barrier write and the device has to issue a
-	 * flush sequence to support it, check how far we are
-	 */
-	if (blk_fs_request(rq) && blk_barrier_rq(rq)) {
-		BUG_ON(q->ordered == QUEUE_ORDERED_NONE);
+	while (1) {
+		while (!list_empty(&q->queue_head)) {
+			rq = list_entry_rq(q->queue_head.next);
+			if (blk_do_ordered(q, &rq))
+				return rq;
+		}
 
-		if (q->ordered == QUEUE_ORDERED_FLUSH &&
-		    !blk_barrier_preflush(rq))
-			rq = blk_start_pre_flush(q, rq);
+		if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
+			return NULL;
 	}
-
-	return rq;
 }
 
 struct request *elv_next_request(request_queue_t *q)
@@ -593,7 +611,21 @@ void elv_completed_request(request_queue_t *q, struct request *rq)
 	 * request is released from the driver, io must be done
 	 */
 	if (blk_account_rq(rq)) {
+		struct request *first_rq = list_entry_rq(q->queue_head.next);
+
 		q->in_flight--;
+
+		/*
+		 * Check if the queue is waiting for fs requests to be
+		 * drained for flush sequence.
+		 */
+		if (q->ordseq && q->in_flight == 0 &&
+		    blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
+		    blk_ordered_req_seq(first_rq) > QUEUE_ORDSEQ_DRAIN) {
+			blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
+			q->request_fn(q);
+		}
+
 		if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
 			e->ops->elevator_completed_req_fn(q, rq);
 	}
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 65c4efc02ad..91d3b4828c4 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -290,8 +290,8 @@ static inline void rq_init(request_queue_t *q, struct request *rq)
 
 /**
  * blk_queue_ordered - does this queue support ordered writes
- * @q:     the request queue
- * @flag:  see below
+ * @q:        the request queue
+ * @ordered:  one of QUEUE_ORDERED_*
  *
  * Description:
  *   For journalled file systems, doing ordered writes on a commit
@@ -300,28 +300,30 @@ static inline void rq_init(request_queue_t *q, struct request *rq)
  *   feature should call this function and indicate so.
  *
  **/
-void blk_queue_ordered(request_queue_t *q, int flag)
-{
-	switch (flag) {
-		case QUEUE_ORDERED_NONE:
-			if (q->flush_rq)
-				kmem_cache_free(request_cachep, q->flush_rq);
-			q->flush_rq = NULL;
-			q->ordered = flag;
-			break;
-		case QUEUE_ORDERED_TAG:
-			q->ordered = flag;
-			break;
-		case QUEUE_ORDERED_FLUSH:
-			q->ordered = flag;
-			if (!q->flush_rq)
-				q->flush_rq = kmem_cache_alloc(request_cachep,
-								GFP_KERNEL);
-			break;
-		default:
-			printk("blk_queue_ordered: bad value %d\n", flag);
-			break;
+int blk_queue_ordered(request_queue_t *q, unsigned ordered,
+		      prepare_flush_fn *prepare_flush_fn)
+{
+	if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) &&
+	    prepare_flush_fn == NULL) {
+		printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n");
+		return -EINVAL;
+	}
+
+	if (ordered != QUEUE_ORDERED_NONE &&
+	    ordered != QUEUE_ORDERED_DRAIN &&
+	    ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
+	    ordered != QUEUE_ORDERED_DRAIN_FUA &&
+	    ordered != QUEUE_ORDERED_TAG &&
+	    ordered != QUEUE_ORDERED_TAG_FLUSH &&
+	    ordered != QUEUE_ORDERED_TAG_FUA) {
+		printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
+		return -EINVAL;
 	}
+
+	q->next_ordered = ordered;
+	q->prepare_flush_fn = prepare_flush_fn;
+
+	return 0;
 }
 
 EXPORT_SYMBOL(blk_queue_ordered);
@@ -346,167 +348,265 @@ EXPORT_SYMBOL(blk_queue_issue_flush_fn);
 /*
  * Cache flushing for ordered writes handling
  */
-static void blk_pre_flush_end_io(struct request *flush_rq, int error)
+inline unsigned blk_ordered_cur_seq(request_queue_t *q)
 {
-	struct request *rq = flush_rq->end_io_data;
-	request_queue_t *q = rq->q;
-
-	elv_completed_request(q, flush_rq);
-
-	rq->flags |= REQ_BAR_PREFLUSH;
-
-	if (!flush_rq->errors)
-		elv_requeue_request(q, rq);
-	else {
-		q->end_flush_fn(q, flush_rq);
-		clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
-		q->request_fn(q);
-	}
+	if (!q->ordseq)
+		return 0;
+	return 1 << ffz(q->ordseq);
 }
 
-static void blk_post_flush_end_io(struct request *flush_rq, int error)
+unsigned blk_ordered_req_seq(struct request *rq)
 {
-	struct request *rq = flush_rq->end_io_data;
 	request_queue_t *q = rq->q;
 
-	elv_completed_request(q, flush_rq);
+	BUG_ON(q->ordseq == 0);
 
-	rq->flags |= REQ_BAR_POSTFLUSH;
+	if (rq == &q->pre_flush_rq)
+		return QUEUE_ORDSEQ_PREFLUSH;
+	if (rq == &q->bar_rq)
+		return QUEUE_ORDSEQ_BAR;
+	if (rq == &q->post_flush_rq)
+		return QUEUE_ORDSEQ_POSTFLUSH;
 
-	q->end_flush_fn(q, flush_rq);
-	clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
-	q->request_fn(q);
+	if ((rq->flags & REQ_ORDERED_COLOR) ==
+	    (q->orig_bar_rq->flags & REQ_ORDERED_COLOR))
+		return QUEUE_ORDSEQ_DRAIN;
+	else
+		return QUEUE_ORDSEQ_DONE;
 }
 
-struct request *blk_start_pre_flush(request_queue_t *q, struct request *rq)
+void blk_ordered_complete_seq(request_queue_t *q, unsigned seq, int error)
 {
-	struct request *flush_rq = q->flush_rq;
-
-	BUG_ON(!blk_barrier_rq(rq));
+	struct request *rq;
+	int uptodate;
 
-	if (test_and_set_bit(QUEUE_FLAG_FLUSH, &q->queue_flags))
-		return NULL;
+	if (error && !q->orderr)
+		q->orderr = error;
 
-	rq_init(q, flush_rq);
-	flush_rq->elevator_private = NULL;
-	flush_rq->flags = REQ_BAR_FLUSH;
-	flush_rq->rq_disk = rq->rq_disk;
-	flush_rq->rl = NULL;
+	BUG_ON(q->ordseq & seq);
+	q->ordseq |= seq;
 
-	/*
-	 * prepare_flush returns 0 if no flush is needed, just mark both
-	 * pre and post flush as done in that case
-	 */
-	if (!q->prepare_flush_fn(q, flush_rq)) {
-		rq->flags |= REQ_BAR_PREFLUSH | REQ_BAR_POSTFLUSH;
-		clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
-		return rq;
-	}
+	if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
+		return;
 
 	/*
-	 * some drivers dequeue requests right away, some only after io
-	 * completion. make sure the request is dequeued.
+	 * Okay, sequence complete.
 	 */
-	if (!list_empty(&rq->queuelist))
-		blkdev_dequeue_request(rq);
+	rq = q->orig_bar_rq;
+	uptodate = q->orderr ? q->orderr : 1;
 
-	flush_rq->end_io_data = rq;
-	flush_rq->end_io = blk_pre_flush_end_io;
+	q->ordseq = 0;
 
-	__elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0);
-	return flush_rq;
+	end_that_request_first(rq, uptodate, rq->hard_nr_sectors);
+	end_that_request_last(rq, uptodate);
 }
 
-static void blk_start_post_flush(request_queue_t *q, struct request *rq)
+static void pre_flush_end_io(struct request *rq, int error)
 {
-	struct request *flush_rq = q->flush_rq;
+	elv_completed_request(rq->q, rq);
+	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
+}
 
-	BUG_ON(!blk_barrier_rq(rq));
+static void bar_end_io(struct request *rq, int error)
+{
+	elv_completed_request(rq->q, rq);
+	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
+}
 
-	rq_init(q, flush_rq);
-	flush_rq->elevator_private = NULL;
-	flush_rq->flags = REQ_BAR_FLUSH;
-	flush_rq->rq_disk = rq->rq_disk;
-	flush_rq->rl = NULL;
+static void post_flush_end_io(struct request *rq, int error)
+{
+	elv_completed_request(rq->q, rq);
+	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
+}
 
-	if (q->prepare_flush_fn(q, flush_rq)) {
-		flush_rq->end_io_data = rq;
-		flush_rq->end_io = blk_post_flush_end_io;
+static void queue_flush(request_queue_t *q, unsigned which)
+{
+	struct request *rq;
+	rq_end_io_fn *end_io;
 
-		__elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0);
-		q->request_fn(q);
+	if (which == QUEUE_ORDERED_PREFLUSH) {
+		rq = &q->pre_flush_rq;
+		end_io = pre_flush_end_io;
+	} else {
+		rq = &q->post_flush_rq;
+		end_io = post_flush_end_io;
 	}
+
+	rq_init(q, rq);
+	rq->flags = REQ_HARDBARRIER;
+	rq->elevator_private = NULL;
+	rq->rq_disk = q->bar_rq.rq_disk;
+	rq->rl = NULL;
+	rq->end_io = end_io;
+	q->prepare_flush_fn(q, rq);
+
+	__elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0);
 }
 
-static inline int blk_check_end_barrier(request_queue_t *q, struct request *rq,
-					int sectors)
+static inline struct request *start_ordered(request_queue_t *q,
+					    struct request *rq)
 {
-	if (sectors > rq->nr_sectors)
-		sectors = rq->nr_sectors;
+	q->bi_size = 0;
+	q->orderr = 0;
+	q->ordered = q->next_ordered;
+	q->ordseq |= QUEUE_ORDSEQ_STARTED;
+
+	/*
+	 * Prep proxy barrier request.
+	 */
+	blkdev_dequeue_request(rq);
+	q->orig_bar_rq = rq;
+	rq = &q->bar_rq;
+	rq_init(q, rq);
+	rq->flags = bio_data_dir(q->orig_bar_rq->bio);
+	rq->flags |= q->ordered & QUEUE_ORDERED_FUA ? REQ_FUA : 0;
+	rq->elevator_private = NULL;
+	rq->rl = NULL;
+	init_request_from_bio(rq, q->orig_bar_rq->bio);
+	rq->end_io = bar_end_io;
+
+	/*
+	 * Queue ordered sequence.  As we stack them at the head, we
+	 * need to queue in reverse order.  Note that we rely on that
+	 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
+	 * request gets inbetween ordered sequence.
+	 */
+	if (q->ordered & QUEUE_ORDERED_POSTFLUSH)
+		queue_flush(q, QUEUE_ORDERED_POSTFLUSH);
+	else
+		q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH;
+
+	__elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0);
+
+	if (q->ordered & QUEUE_ORDERED_PREFLUSH) {
+		queue_flush(q, QUEUE_ORDERED_PREFLUSH);
+		rq = &q->pre_flush_rq;
+	} else
+		q->ordseq |= QUEUE_ORDSEQ_PREFLUSH;
 
-	rq->nr_sectors -= sectors;
-	return rq->nr_sectors;
+	if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0)
+		q->ordseq |= QUEUE_ORDSEQ_DRAIN;
+	else
+		rq = NULL;
+
+	return rq;
 }
 
-static int __blk_complete_barrier_rq(request_queue_t *q, struct request *rq,
-				     int sectors, int queue_locked)
+int blk_do_ordered(request_queue_t *q, struct request **rqp)
 {
-	if (q->ordered != QUEUE_ORDERED_FLUSH)
-		return 0;
-	if (!blk_fs_request(rq) || !blk_barrier_rq(rq))
-		return 0;
-	if (blk_barrier_postflush(rq))
-		return 0;
+	struct request *rq = *rqp, *allowed_rq;
+	int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
 
-	if (!blk_check_end_barrier(q, rq, sectors)) {
-		unsigned long flags = 0;
+	if (!q->ordseq) {
+		if (!is_barrier)
+			return 1;
 
-		if (!queue_locked)
-			spin_lock_irqsave(q->queue_lock, flags);
+		if (q->next_ordered != QUEUE_ORDERED_NONE) {
+			*rqp = start_ordered(q, rq);
+			return 1;
+		} else {
+			/*
+			 * This can happen when the queue switches to
+			 * ORDERED_NONE while this request is on it.
+			 */
+			blkdev_dequeue_request(rq);
+			end_that_request_first(rq, -EOPNOTSUPP,
+					       rq->hard_nr_sectors);
+			end_that_request_last(rq, -EOPNOTSUPP);
+			*rqp = NULL;
+			return 0;
+		}
+	}
 
-		blk_start_post_flush(q, rq);
+	if (q->ordered & QUEUE_ORDERED_TAG) {
+		if (is_barrier && rq != &q->bar_rq)
+			*rqp = NULL;
+		return 1;
+	}
 
-		if (!queue_locked)
-			spin_unlock_irqrestore(q->queue_lock, flags);
+	switch (blk_ordered_cur_seq(q)) {
+	case QUEUE_ORDSEQ_PREFLUSH:
+		allowed_rq = &q->pre_flush_rq;
+		break;
+	case QUEUE_ORDSEQ_BAR:
+		allowed_rq = &q->bar_rq;
+		break;
+	case QUEUE_ORDSEQ_POSTFLUSH:
+		allowed_rq = &q->post_flush_rq;
+		break;
+	default:
+		allowed_rq = NULL;
+		break;
 	}
 
+	if (rq != allowed_rq &&
+	    (blk_fs_request(rq) || rq == &q->pre_flush_rq ||
+	     rq == &q->post_flush_rq))
+		*rqp = NULL;
+
 	return 1;
 }
 
-/**
- * blk_complete_barrier_rq - complete possible barrier request
- * @q:  the request queue for the device
- * @rq:  the request
- * @sectors:  number of sectors to complete
- *
- * Description:
- *   Used in driver end_io handling to determine whether to postpone
- *   completion of a barrier request until a post flush has been done. This
- *   is the unlocked variant, used if the caller doesn't already hold the
- *   queue lock.
- **/
-int blk_complete_barrier_rq(request_queue_t *q, struct request *rq, int sectors)
+static int flush_dry_bio_endio(struct bio *bio, unsigned int bytes, int error)
 {
-	return __blk_complete_barrier_rq(q, rq, sectors, 0);
+	request_queue_t *q = bio->bi_private;
+	struct bio_vec *bvec;
+	int i;
+
+	/*
+	 * This is dry run, restore bio_sector and size.  We'll finish
+	 * this request again with the original bi_end_io after an
+	 * error occurs or post flush is complete.
+	 */
+	q->bi_size += bytes;
+
+	if (bio->bi_size)
+		return 1;
+
+	/* Rewind bvec's */
+	bio->bi_idx = 0;
+	bio_for_each_segment(bvec, bio, i) {
+		bvec->bv_len += bvec->bv_offset;
+		bvec->bv_offset = 0;
+	}
+
+	/* Reset bio */
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio->bi_size = q->bi_size;
+	bio->bi_sector -= (q->bi_size >> 9);
+	q->bi_size = 0;
+
+	return 0;
 }
-EXPORT_SYMBOL(blk_complete_barrier_rq);
 
-/**
- * blk_complete_barrier_rq_locked - complete possible barrier request
- * @q:  the request queue for the device
- * @rq:  the request
- * @sectors:  number of sectors to complete
- *
- * Description:
- *   See blk_complete_barrier_rq(). This variant must be used if the caller
- *   holds the queue lock.
- **/
-int blk_complete_barrier_rq_locked(request_queue_t *q, struct request *rq,
-				   int sectors)
+static inline int ordered_bio_endio(struct request *rq, struct bio *bio,
+				    unsigned int nbytes, int error)
 {
-	return __blk_complete_barrier_rq(q, rq, sectors, 1);
+	request_queue_t *q = rq->q;
+	bio_end_io_t *endio;
+	void *private;
+
+	if (&q->bar_rq != rq)
+		return 0;
+
+	/*
+	 * Okay, this is the barrier request in progress, dry finish it.
+	 */
+	if (error && !q->orderr)
+		q->orderr = error;
+
+	endio = bio->bi_end_io;
+	private = bio->bi_private;
+	bio->bi_end_io = flush_dry_bio_endio;
+	bio->bi_private = q;
+
+	bio_endio(bio, nbytes, error);
+
+	bio->bi_end_io = endio;
+	bio->bi_private = private;
+
+	return 1;
 }
-EXPORT_SYMBOL(blk_complete_barrier_rq_locked);
 
 /**
  * blk_queue_bounce_limit - set bounce buffer limit for queue
@@ -1047,6 +1147,7 @@ static const char * const rq_flags[] = {
 	"REQ_SORTED",
 	"REQ_SOFTBARRIER",
 	"REQ_HARDBARRIER",
+	"REQ_FUA",
 	"REQ_CMD",
 	"REQ_NOMERGE",
 	"REQ_STARTED",
@@ -1066,6 +1167,7 @@ static const char * const rq_flags[] = {
 	"REQ_PM_SUSPEND",
 	"REQ_PM_RESUME",
 	"REQ_PM_SHUTDOWN",
+	"REQ_ORDERED_COLOR",
 };
 
 void blk_dump_rq_flags(struct request *rq, char *msg)
@@ -1643,8 +1745,6 @@ void blk_cleanup_queue(request_queue_t * q)
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
-	blk_queue_ordered(q, QUEUE_ORDERED_NONE);
-
 	kmem_cache_free(requestq_cachep, q);
 }
 
@@ -2714,7 +2814,7 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 	spin_lock_prefetch(q->queue_lock);
 
 	barrier = bio_barrier(bio);
-	if (unlikely(barrier) && (q->ordered == QUEUE_ORDERED_NONE)) {
+	if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
 		err = -EOPNOTSUPP;
 		goto end_io;
 	}
@@ -3075,7 +3175,8 @@ static int __end_that_request_first(struct request *req, int uptodate,
 		if (nr_bytes >= bio->bi_size) {
 			req->bio = bio->bi_next;
 			nbytes = bio->bi_size;
-			bio_endio(bio, nbytes, error);
+			if (!ordered_bio_endio(req, bio, nbytes, error))
+				bio_endio(bio, nbytes, error);
 			next_idx = 0;
 			bio_nbytes = 0;
 		} else {
@@ -3130,7 +3231,8 @@ static int __end_that_request_first(struct request *req, int uptodate,
 	 * if the request wasn't completed, update state
 	 */
 	if (bio_nbytes) {
-		bio_endio(bio, bio_nbytes, error);
+		if (!ordered_bio_endio(req, bio, bio_nbytes, error))
+			bio_endio(bio, bio_nbytes, error);
 		bio->bi_idx += next_idx;
 		bio_iovec(bio)->bv_offset += nr_bytes;
 		bio_iovec(bio)->bv_len -= nr_bytes;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a0ce8c58516..15db0f112d0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -207,6 +207,7 @@ enum rq_flag_bits {
 	__REQ_SORTED,		/* elevator knows about this request */
 	__REQ_SOFTBARRIER,	/* may not be passed by ioscheduler */
 	__REQ_HARDBARRIER,	/* may not be passed by drive either */
+	__REQ_FUA,		/* forced unit access */
 	__REQ_CMD,		/* is a regular fs rw request */
 	__REQ_NOMERGE,		/* don't touch this for merging */
 	__REQ_STARTED,		/* drive already may have started this one */
@@ -230,9 +231,7 @@ enum rq_flag_bits {
 	__REQ_PM_SUSPEND,	/* suspend request */
 	__REQ_PM_RESUME,	/* resume request */
 	__REQ_PM_SHUTDOWN,	/* shutdown request */
-	__REQ_BAR_PREFLUSH,	/* barrier pre-flush done */
-	__REQ_BAR_POSTFLUSH,	/* barrier post-flush */
-	__REQ_BAR_FLUSH,	/* rq is the flush request */
+	__REQ_ORDERED_COLOR,	/* is before or after barrier */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -241,6 +240,7 @@ enum rq_flag_bits {
 #define REQ_SORTED	(1 << __REQ_SORTED)
 #define REQ_SOFTBARRIER	(1 << __REQ_SOFTBARRIER)
 #define REQ_HARDBARRIER	(1 << __REQ_HARDBARRIER)
+#define REQ_FUA		(1 << __REQ_FUA)
 #define REQ_CMD		(1 << __REQ_CMD)
 #define REQ_NOMERGE	(1 << __REQ_NOMERGE)
 #define REQ_STARTED	(1 << __REQ_STARTED)
@@ -260,9 +260,7 @@ enum rq_flag_bits {
 #define REQ_PM_SUSPEND	(1 << __REQ_PM_SUSPEND)
 #define REQ_PM_RESUME	(1 << __REQ_PM_RESUME)
 #define REQ_PM_SHUTDOWN	(1 << __REQ_PM_SHUTDOWN)
-#define REQ_BAR_PREFLUSH	(1 << __REQ_BAR_PREFLUSH)
-#define REQ_BAR_POSTFLUSH	(1 << __REQ_BAR_POSTFLUSH)
-#define REQ_BAR_FLUSH	(1 << __REQ_BAR_FLUSH)
+#define REQ_ORDERED_COLOR	(1 << __REQ_ORDERED_COLOR)
 
 /*
  * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME
@@ -292,8 +290,7 @@ struct bio_vec;
 typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *);
 typedef void (activity_fn) (void *data, int rw);
 typedef int (issue_flush_fn) (request_queue_t *, struct gendisk *, sector_t *);
-typedef int (prepare_flush_fn) (request_queue_t *, struct request *);
-typedef void (end_flush_fn) (request_queue_t *, struct request *);
+typedef void (prepare_flush_fn) (request_queue_t *, struct request *);
 
 enum blk_queue_state {
 	Queue_down,
@@ -335,7 +332,6 @@ struct request_queue
 	activity_fn		*activity_fn;
 	issue_flush_fn		*issue_flush_fn;
 	prepare_flush_fn	*prepare_flush_fn;
-	end_flush_fn		*end_flush_fn;
 
 	/*
 	 * Dispatch queue sorting
@@ -420,14 +416,11 @@ struct request_queue
 	/*
 	 * reserved for flush operations
 	 */
-	struct request		*flush_rq;
-	unsigned char		ordered;
-};
-
-enum {
-	QUEUE_ORDERED_NONE,
-	QUEUE_ORDERED_TAG,
-	QUEUE_ORDERED_FLUSH,
+	unsigned int		ordered, next_ordered, ordseq;
+	int			orderr, ordcolor;
+	struct request		pre_flush_rq, bar_rq, post_flush_rq;
+	struct request		*orig_bar_rq;
+	unsigned int		bi_size;
 };
 
 #define RQ_INACTIVE		(-1)
@@ -445,12 +438,51 @@ enum {
 #define QUEUE_FLAG_REENTER	6	/* Re-entrancy avoidance */
 #define QUEUE_FLAG_PLUGGED	7	/* queue is plugged */
 #define QUEUE_FLAG_ELVSWITCH	8	/* don't use elevator, just do FIFO */
-#define QUEUE_FLAG_FLUSH	9	/* doing barrier flush sequence */
+
+enum {
+	/*
+	 * Hardbarrier is supported with one of the following methods.
+	 *
+	 * NONE		: hardbarrier unsupported
+	 * DRAIN	: ordering by draining is enough
+	 * DRAIN_FLUSH	: ordering by draining w/ pre and post flushes
+	 * DRAIN_FUA	: ordering by draining w/ pre flush and FUA write
+	 * TAG		: ordering by tag is enough
+	 * TAG_FLUSH	: ordering by tag w/ pre and post flushes
+	 * TAG_FUA	: ordering by tag w/ pre flush and FUA write
+	 */
+	QUEUE_ORDERED_NONE	= 0x00,
+	QUEUE_ORDERED_DRAIN	= 0x01,
+	QUEUE_ORDERED_TAG	= 0x02,
+
+	QUEUE_ORDERED_PREFLUSH	= 0x10,
+	QUEUE_ORDERED_POSTFLUSH	= 0x20,
+	QUEUE_ORDERED_FUA	= 0x40,
+
+	QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN |
+			QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH,
+	QUEUE_ORDERED_DRAIN_FUA	= QUEUE_ORDERED_DRAIN |
+			QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA,
+	QUEUE_ORDERED_TAG_FLUSH	= QUEUE_ORDERED_TAG |
+			QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH,
+	QUEUE_ORDERED_TAG_FUA	= QUEUE_ORDERED_TAG |
+			QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA,
+
+	/*
+	 * Ordered operation sequence
+	 */
+	QUEUE_ORDSEQ_STARTED	= 0x01,	/* flushing in progress */
+	QUEUE_ORDSEQ_DRAIN	= 0x02,	/* waiting for the queue to be drained */
+	QUEUE_ORDSEQ_PREFLUSH	= 0x04,	/* pre-flushing in progress */
+	QUEUE_ORDSEQ_BAR	= 0x08,	/* original barrier req in progress */
+	QUEUE_ORDSEQ_POSTFLUSH	= 0x10,	/* post-flushing in progress */
+	QUEUE_ORDSEQ_DONE	= 0x20,
+};
 
 #define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
-#define blk_queue_flushing(q)	test_bit(QUEUE_FLAG_FLUSH, &(q)->queue_flags)
+#define blk_queue_flushing(q)	((q)->ordseq)
 
 #define blk_fs_request(rq)	((rq)->flags & REQ_CMD)
 #define blk_pc_request(rq)	((rq)->flags & REQ_BLOCK_PC)
@@ -466,8 +498,7 @@ enum {
 
 #define blk_sorted_rq(rq)	((rq)->flags & REQ_SORTED)
 #define blk_barrier_rq(rq)	((rq)->flags & REQ_HARDBARRIER)
-#define blk_barrier_preflush(rq)	((rq)->flags & REQ_BAR_PREFLUSH)
-#define blk_barrier_postflush(rq)	((rq)->flags & REQ_BAR_POSTFLUSH)
+#define blk_fua_rq(rq)		((rq)->flags & REQ_FUA)
 
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)
 
@@ -665,11 +696,12 @@ extern void blk_queue_prep_rq(request_queue_t *, prep_rq_fn *pfn);
 extern void blk_queue_merge_bvec(request_queue_t *, merge_bvec_fn *);
 extern void blk_queue_dma_alignment(request_queue_t *, int);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
-extern void blk_queue_ordered(request_queue_t *, int);
+extern int blk_queue_ordered(request_queue_t *, unsigned, prepare_flush_fn *);
 extern void blk_queue_issue_flush_fn(request_queue_t *, issue_flush_fn *);
-extern struct request *blk_start_pre_flush(request_queue_t *,struct request *);
-extern int blk_complete_barrier_rq(request_queue_t *, struct request *, int);
-extern int blk_complete_barrier_rq_locked(request_queue_t *, struct request *, int);
+extern int blk_do_ordered(request_queue_t *, struct request **);
+extern unsigned blk_ordered_cur_seq(request_queue_t *);
+extern unsigned blk_ordered_req_seq(struct request *);
+extern void blk_ordered_complete_seq(request_queue_t *, unsigned, int);
 
 extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index a74c27e460b..fb80fa44c4d 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -130,6 +130,7 @@ extern int elv_try_last_merge(request_queue_t *, struct bio *);
 #define ELEVATOR_INSERT_FRONT	1
 #define ELEVATOR_INSERT_BACK	2
 #define ELEVATOR_INSERT_SORT	3
+#define ELEVATOR_INSERT_REQUEUE	4
 
 /*
  * return values from elevator_may_queue_fn
-- 
cgit v1.2.3


From 461d4e90c8cd049718884cd17c955e231140d3be Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:52:55 +0100
Subject: [BLOCK] update SCSI to use new blk_ordered for barriers

All ordered request related stuff delegated to HLD.  Midlayer
now doens't deal with ordered setting or prepare_flush
callback.  sd.c updated to deal with blk_queue_ordered
setting.  Currently, ordered tag isn't used as SCSI midlayer
cannot guarantee request ordering.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 drivers/scsi/hosts.c       |  9 -------
 drivers/scsi/scsi_lib.c    | 46 ------------------------------------
 drivers/scsi/sd.c          | 58 ++++++++++++++++------------------------------
 include/scsi/scsi_driver.h |  1 -
 include/scsi/scsi_host.h   |  1 -
 5 files changed, 20 insertions(+), 95 deletions(-)

diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 5b9c2c5a7f0..66783c860a1 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -347,17 +347,8 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
 	shost->cmd_per_lun = sht->cmd_per_lun;
 	shost->unchecked_isa_dma = sht->unchecked_isa_dma;
 	shost->use_clustering = sht->use_clustering;
-	shost->ordered_flush = sht->ordered_flush;
 	shost->ordered_tag = sht->ordered_tag;
 
-	/*
-	 * hosts/devices that do queueing must support ordered tags
-	 */
-	if (shost->can_queue > 1 && shost->ordered_flush) {
-		printk(KERN_ERR "scsi: ordered flushes don't support queueing\n");
-		shost->ordered_flush = 0;
-	}
-
 	if (sht->max_host_blocked)
 		shost->max_host_blocked = sht->max_host_blocked;
 	else
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 53551f1dfe2..7a38b101976 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -932,9 +932,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes,
 	int sense_valid = 0;
 	int sense_deferred = 0;
 
-	if (blk_complete_barrier_rq(q, req, good_bytes >> 9))
-		return;
-
 	/*
 	 * Free up any indirection buffers we allocated for DMA purposes. 
 	 * For the case of a READ, we need to copy the data out of the
@@ -1199,38 +1196,6 @@ static int scsi_init_io(struct scsi_cmnd *cmd)
 	return BLKPREP_KILL;
 }
 
-static int scsi_prepare_flush_fn(request_queue_t *q, struct request *rq)
-{
-	struct scsi_device *sdev = q->queuedata;
-	struct scsi_driver *drv;
-
-	if (sdev->sdev_state == SDEV_RUNNING) {
-		drv = *(struct scsi_driver **) rq->rq_disk->private_data;
-
-		if (drv->prepare_flush)
-			return drv->prepare_flush(q, rq);
-	}
-
-	return 0;
-}
-
-static void scsi_end_flush_fn(request_queue_t *q, struct request *rq)
-{
-	struct scsi_device *sdev = q->queuedata;
-	struct request *flush_rq = rq->end_io_data;
-	struct scsi_driver *drv;
-
-	if (flush_rq->errors) {
-		printk("scsi: barrier error, disabling flush support\n");
-		blk_queue_ordered(q, QUEUE_ORDERED_NONE);
-	}
-
-	if (sdev->sdev_state == SDEV_RUNNING) {
-		drv = *(struct scsi_driver **) rq->rq_disk->private_data;
-		drv->end_flush(q, rq);
-	}
-}
-
 static int scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk,
 			       sector_t *error_sector)
 {
@@ -1703,17 +1668,6 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
 	blk_queue_segment_boundary(q, shost->dma_boundary);
 	blk_queue_issue_flush_fn(q, scsi_issue_flush_fn);
 
-	/*
-	 * ordered tags are superior to flush ordering
-	 */
-	if (shost->ordered_tag)
-		blk_queue_ordered(q, QUEUE_ORDERED_TAG);
-	else if (shost->ordered_flush) {
-		blk_queue_ordered(q, QUEUE_ORDERED_FLUSH);
-		q->prepare_flush_fn = scsi_prepare_flush_fn;
-		q->end_flush_fn = scsi_end_flush_fn;
-	}
-
 	if (!shost->use_clustering)
 		clear_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
 	return q;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index d651150ee76..2eefc9eb5da 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -121,8 +121,7 @@ static void sd_shutdown(struct device *dev);
 static void sd_rescan(struct device *);
 static int sd_init_command(struct scsi_cmnd *);
 static int sd_issue_flush(struct device *, sector_t *);
-static void sd_end_flush(request_queue_t *, struct request *);
-static int sd_prepare_flush(request_queue_t *, struct request *);
+static void sd_prepare_flush(request_queue_t *, struct request *);
 static void sd_read_capacity(struct scsi_disk *sdkp, char *diskname,
 			     unsigned char *buffer);
 
@@ -137,8 +136,6 @@ static struct scsi_driver sd_template = {
 	.rescan			= sd_rescan,
 	.init_command		= sd_init_command,
 	.issue_flush		= sd_issue_flush,
-	.prepare_flush		= sd_prepare_flush,
-	.end_flush		= sd_end_flush,
 };
 
 /*
@@ -729,42 +726,13 @@ static int sd_issue_flush(struct device *dev, sector_t *error_sector)
 	return ret;
 }
 
-static void sd_end_flush(request_queue_t *q, struct request *flush_rq)
+static void sd_prepare_flush(request_queue_t *q, struct request *rq)
 {
-	struct request *rq = flush_rq->end_io_data;
-	struct scsi_cmnd *cmd = rq->special;
-	unsigned int bytes = rq->hard_nr_sectors << 9;
-
-	if (!flush_rq->errors) {
-		spin_unlock(q->queue_lock);
-		scsi_io_completion(cmd, bytes, 0);
-		spin_lock(q->queue_lock);
-	} else if (blk_barrier_postflush(rq)) {
-		spin_unlock(q->queue_lock);
-		scsi_io_completion(cmd, 0, bytes);
-		spin_lock(q->queue_lock);
-	} else {
-		/*
-		 * force journal abort of barriers
-		 */
-		end_that_request_first(rq, -EOPNOTSUPP, rq->hard_nr_sectors);
-		end_that_request_last(rq, -EOPNOTSUPP);
-	}
-}
-
-static int sd_prepare_flush(request_queue_t *q, struct request *rq)
-{
-	struct scsi_device *sdev = q->queuedata;
-	struct scsi_disk *sdkp = dev_get_drvdata(&sdev->sdev_gendev);
-
-	if (!sdkp || !sdkp->WCE)
-		return 0;
-
 	memset(rq->cmd, 0, sizeof(rq->cmd));
-	rq->flags |= REQ_BLOCK_PC | REQ_SOFTBARRIER;
+	rq->flags |= REQ_BLOCK_PC;
 	rq->timeout = SD_TIMEOUT;
 	rq->cmd[0] = SYNCHRONIZE_CACHE;
-	return 1;
+	rq->cmd_len = 10;
 }
 
 static void sd_rescan(struct device *dev)
@@ -1462,6 +1430,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
 	struct scsi_disk *sdkp = scsi_disk(disk);
 	struct scsi_device *sdp = sdkp->device;
 	unsigned char *buffer;
+	unsigned ordered;
 
 	SCSI_LOG_HLQUEUE(3, printk("sd_revalidate_disk: disk=%s\n", disk->disk_name));
 
@@ -1498,7 +1467,20 @@ static int sd_revalidate_disk(struct gendisk *disk)
 		sd_read_write_protect_flag(sdkp, disk->disk_name, buffer);
 		sd_read_cache_type(sdkp, disk->disk_name, buffer);
 	}
-		
+
+	/*
+	 * We now have all cache related info, determine how we deal
+	 * with ordered requests.  Note that as the current SCSI
+	 * dispatch function can alter request order, we cannot use
+	 * QUEUE_ORDERED_TAG_* even when ordered tag is supported.
+	 */
+	if (sdkp->WCE)
+		ordered = QUEUE_ORDERED_DRAIN_FLUSH;
+	else
+		ordered = QUEUE_ORDERED_DRAIN;
+
+	blk_queue_ordered(sdkp->disk->queue, ordered, sd_prepare_flush);
+
 	set_capacity(disk, sdkp->capacity);
 	kfree(buffer);
 
@@ -1598,6 +1580,7 @@ static int sd_probe(struct device *dev)
 	strcpy(gd->devfs_name, sdp->devfs_name);
 
 	gd->private_data = &sdkp->driver;
+	gd->queue = sdkp->device->request_queue;
 
 	sd_revalidate_disk(gd);
 
@@ -1605,7 +1588,6 @@ static int sd_probe(struct device *dev)
 	gd->flags = GENHD_FL_DRIVERFS;
 	if (sdp->removable)
 		gd->flags |= GENHD_FL_REMOVABLE;
-	gd->queue = sdkp->device->request_queue;
 
 	dev_set_drvdata(dev, sdkp);
 	add_disk(gd);
diff --git a/include/scsi/scsi_driver.h b/include/scsi/scsi_driver.h
index 850dfa877fd..02e26c1672b 100644
--- a/include/scsi/scsi_driver.h
+++ b/include/scsi/scsi_driver.h
@@ -15,7 +15,6 @@ struct scsi_driver {
 	void (*rescan)(struct device *);
 	int (*issue_flush)(struct device *, sector_t *);
 	int (*prepare_flush)(struct request_queue *, struct request *);
-	void (*end_flush)(struct request_queue *, struct request *);
 };
 #define to_scsi_driver(drv) \
 	container_of((drv), struct scsi_driver, gendrv)
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 6cbb1982ed0..25f637bd38b 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -392,7 +392,6 @@ struct scsi_host_template {
 	/*
 	 * ordered write support
 	 */
-	unsigned ordered_flush:1;
 	unsigned ordered_tag:1;
 
 	/*
-- 
cgit v1.2.3


From 007365ad60387df30f02f01fdc2b6e6432f6c265 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:53:52 +0100
Subject: [BLOCK] scsi: add FUA support to sd

Add FUA support for barriers to SCSI disk.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 drivers/scsi/sd.c | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 2eefc9eb5da..32d4d8d7b9f 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -102,6 +102,7 @@ struct scsi_disk {
 	u8		write_prot;
 	unsigned	WCE : 1;	/* state of disk WCE bit */
 	unsigned	RCD : 1;	/* state of disk RCD bit, unused */
+	unsigned	DPOFUA : 1;	/* state of disk DPOFUA bit */
 };
 
 static DEFINE_IDR(sd_index_idr);
@@ -343,6 +344,7 @@ static int sd_init_command(struct scsi_cmnd * SCpnt)
 	
 	if (block > 0xffffffff) {
 		SCpnt->cmnd[0] += READ_16 - READ_6;
+		SCpnt->cmnd[1] |= blk_fua_rq(rq) ? 0x8 : 0;
 		SCpnt->cmnd[2] = sizeof(block) > 4 ? (unsigned char) (block >> 56) & 0xff : 0;
 		SCpnt->cmnd[3] = sizeof(block) > 4 ? (unsigned char) (block >> 48) & 0xff : 0;
 		SCpnt->cmnd[4] = sizeof(block) > 4 ? (unsigned char) (block >> 40) & 0xff : 0;
@@ -362,6 +364,7 @@ static int sd_init_command(struct scsi_cmnd * SCpnt)
 			this_count = 0xffff;
 
 		SCpnt->cmnd[0] += READ_10 - READ_6;
+		SCpnt->cmnd[1] |= blk_fua_rq(rq) ? 0x8 : 0;
 		SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff;
 		SCpnt->cmnd[3] = (unsigned char) (block >> 16) & 0xff;
 		SCpnt->cmnd[4] = (unsigned char) (block >> 8) & 0xff;
@@ -370,6 +373,17 @@ static int sd_init_command(struct scsi_cmnd * SCpnt)
 		SCpnt->cmnd[7] = (unsigned char) (this_count >> 8) & 0xff;
 		SCpnt->cmnd[8] = (unsigned char) this_count & 0xff;
 	} else {
+		if (unlikely(blk_fua_rq(rq))) {
+			/*
+			 * This happens only if this drive failed
+			 * 10byte rw command with ILLEGAL_REQUEST
+			 * during operation and thus turned off
+			 * use_10_for_rw.
+			 */
+			printk(KERN_ERR "sd: FUA write on READ/WRITE(6) drive\n");
+			return 0;
+		}
+
 		SCpnt->cmnd[1] |= (unsigned char) ((block >> 16) & 0x1f);
 		SCpnt->cmnd[2] = (unsigned char) ((block >> 8) & 0xff);
 		SCpnt->cmnd[3] = (unsigned char) block & 0xff;
@@ -1395,10 +1409,18 @@ sd_read_cache_type(struct scsi_disk *sdkp, char *diskname,
 			sdkp->RCD = 0;
 		}
 
+		sdkp->DPOFUA = (data.device_specific & 0x10) != 0;
+		if (sdkp->DPOFUA && !sdkp->device->use_10_for_rw) {
+			printk(KERN_NOTICE "SCSI device %s: uses "
+			       "READ/WRITE(6), disabling FUA\n", diskname);
+			sdkp->DPOFUA = 0;
+		}
+
 		ct =  sdkp->RCD + 2*sdkp->WCE;
 
-		printk(KERN_NOTICE "SCSI device %s: drive cache: %s\n",
-		       diskname, types[ct]);
+		printk(KERN_NOTICE "SCSI device %s: drive cache: %s%s\n",
+		       diskname, types[ct],
+		       sdkp->DPOFUA ? " w/ FUA" : "");
 
 		return;
 	}
@@ -1475,7 +1497,8 @@ static int sd_revalidate_disk(struct gendisk *disk)
 	 * QUEUE_ORDERED_TAG_* even when ordered tag is supported.
 	 */
 	if (sdkp->WCE)
-		ordered = QUEUE_ORDERED_DRAIN_FLUSH;
+		ordered = sdkp->DPOFUA
+			? QUEUE_ORDERED_DRAIN_FUA : QUEUE_ORDERED_DRAIN_FLUSH;
 	else
 		ordered = QUEUE_ORDERED_DRAIN;
 
-- 
cgit v1.2.3


From 93c9338713d4e11102cd09b4670ad42a336b06a3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:55:00 +0100
Subject: [BLOCK] update libata to use new blk_ordered for barriers

Reflect changes in SCSI midlayer and updated to use new
ordered request implementation

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 drivers/scsi/ahci.c         | 1 -
 drivers/scsi/ata_piix.c     | 1 -
 drivers/scsi/sata_mv.c      | 1 -
 drivers/scsi/sata_nv.c      | 1 -
 drivers/scsi/sata_promise.c | 1 -
 drivers/scsi/sata_sil.c     | 1 -
 drivers/scsi/sata_sil24.c   | 1 -
 drivers/scsi/sata_sis.c     | 1 -
 drivers/scsi/sata_svw.c     | 1 -
 drivers/scsi/sata_sx4.c     | 1 -
 drivers/scsi/sata_uli.c     | 1 -
 drivers/scsi/sata_via.c     | 1 -
 drivers/scsi/sata_vsc.c     | 1 -
 13 files changed, 13 deletions(-)

diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c
index 887eaa2a3eb..d113290b5fc 100644
--- a/drivers/scsi/ahci.c
+++ b/drivers/scsi/ahci.c
@@ -214,7 +214,6 @@ static struct scsi_host_template ahci_sht = {
 	.dma_boundary		= AHCI_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 static const struct ata_port_operations ahci_ops = {
diff --git a/drivers/scsi/ata_piix.c b/drivers/scsi/ata_piix.c
index 0ea27873b9f..4b647eefc9a 100644
--- a/drivers/scsi/ata_piix.c
+++ b/drivers/scsi/ata_piix.c
@@ -185,7 +185,6 @@ static struct scsi_host_template piix_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 static const struct ata_port_operations piix_pata_ops = {
diff --git a/drivers/scsi/sata_mv.c b/drivers/scsi/sata_mv.c
index b2bf16a9bf4..cd54244058b 100644
--- a/drivers/scsi/sata_mv.c
+++ b/drivers/scsi/sata_mv.c
@@ -374,7 +374,6 @@ static struct scsi_host_template mv_sht = {
 	.dma_boundary		= MV_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 static const struct ata_port_operations mv5_ops = {
diff --git a/drivers/scsi/sata_nv.c b/drivers/scsi/sata_nv.c
index 4954896dfdb..c0cf52cb975 100644
--- a/drivers/scsi/sata_nv.c
+++ b/drivers/scsi/sata_nv.c
@@ -235,7 +235,6 @@ static struct scsi_host_template nv_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 static const struct ata_port_operations nv_ops = {
diff --git a/drivers/scsi/sata_promise.c b/drivers/scsi/sata_promise.c
index da7fa04b8a7..3d1ea09a06a 100644
--- a/drivers/scsi/sata_promise.c
+++ b/drivers/scsi/sata_promise.c
@@ -114,7 +114,6 @@ static struct scsi_host_template pdc_ata_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 static const struct ata_port_operations pdc_sata_ops = {
diff --git a/drivers/scsi/sata_sil.c b/drivers/scsi/sata_sil.c
index d2053487c73..b017f85e6d6 100644
--- a/drivers/scsi/sata_sil.c
+++ b/drivers/scsi/sata_sil.c
@@ -147,7 +147,6 @@ static struct scsi_host_template sil_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 static const struct ata_port_operations sil_ops = {
diff --git a/drivers/scsi/sata_sil24.c b/drivers/scsi/sata_sil24.c
index a0ad3ed2200..923130185a9 100644
--- a/drivers/scsi/sata_sil24.c
+++ b/drivers/scsi/sata_sil24.c
@@ -292,7 +292,6 @@ static struct scsi_host_template sil24_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1, /* NCQ not supported yet */
 };
 
 static const struct ata_port_operations sil24_ops = {
diff --git a/drivers/scsi/sata_sis.c b/drivers/scsi/sata_sis.c
index 32e12620b16..2df8c5632ac 100644
--- a/drivers/scsi/sata_sis.c
+++ b/drivers/scsi/sata_sis.c
@@ -99,7 +99,6 @@ static struct scsi_host_template sis_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 static const struct ata_port_operations sis_ops = {
diff --git a/drivers/scsi/sata_svw.c b/drivers/scsi/sata_svw.c
index 6e7f7c83a75..668373590aa 100644
--- a/drivers/scsi/sata_svw.c
+++ b/drivers/scsi/sata_svw.c
@@ -303,7 +303,6 @@ static struct scsi_host_template k2_sata_sht = {
 	.proc_info		= k2_sata_proc_info,
 #endif
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 
diff --git a/drivers/scsi/sata_sx4.c b/drivers/scsi/sata_sx4.c
index 94b253b80da..bc87c16c80d 100644
--- a/drivers/scsi/sata_sx4.c
+++ b/drivers/scsi/sata_sx4.c
@@ -194,7 +194,6 @@ static struct scsi_host_template pdc_sata_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 static const struct ata_port_operations pdc_20621_ops = {
diff --git a/drivers/scsi/sata_uli.c b/drivers/scsi/sata_uli.c
index b2422a0f25c..9635ca70097 100644
--- a/drivers/scsi/sata_uli.c
+++ b/drivers/scsi/sata_uli.c
@@ -87,7 +87,6 @@ static struct scsi_host_template uli_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 static const struct ata_port_operations uli_ops = {
diff --git a/drivers/scsi/sata_via.c b/drivers/scsi/sata_via.c
index c76215692da..6d5b0a794cf 100644
--- a/drivers/scsi/sata_via.c
+++ b/drivers/scsi/sata_via.c
@@ -106,7 +106,6 @@ static struct scsi_host_template svia_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 static const struct ata_port_operations svia_sata_ops = {
diff --git a/drivers/scsi/sata_vsc.c b/drivers/scsi/sata_vsc.c
index fcfa486965b..2e2c3b7acb0 100644
--- a/drivers/scsi/sata_vsc.c
+++ b/drivers/scsi/sata_vsc.c
@@ -235,7 +235,6 @@ static struct scsi_host_template vsc_sata_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 
-- 
cgit v1.2.3


From 9a3dccc42556537a48f39ee9a9e7ab90a933f766 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:56:18 +0100
Subject: [BLOCK] add FUA support to libata

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 drivers/scsi/libata-core.c | 31 +++++++++++++++++++++++++------
 drivers/scsi/libata-scsi.c | 32 ++++++++++++++++++++++++++------
 drivers/scsi/libata.h      |  4 +++-
 include/linux/ata.h        |  6 +++++-
 include/linux/libata.h     |  3 ++-
 5 files changed, 61 insertions(+), 15 deletions(-)

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 9ea10258791..bdfb0a88cd6 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -562,16 +562,28 @@ static const u8 ata_rw_cmds[] = {
 	ATA_CMD_WRITE_MULTI,
 	ATA_CMD_READ_MULTI_EXT,
 	ATA_CMD_WRITE_MULTI_EXT,
+	0,
+	0,
+	0,
+	ATA_CMD_WRITE_MULTI_FUA_EXT,
 	/* pio */
 	ATA_CMD_PIO_READ,
 	ATA_CMD_PIO_WRITE,
 	ATA_CMD_PIO_READ_EXT,
 	ATA_CMD_PIO_WRITE_EXT,
+	0,
+	0,
+	0,
+	0,
 	/* dma */
 	ATA_CMD_READ,
 	ATA_CMD_WRITE,
 	ATA_CMD_READ_EXT,
-	ATA_CMD_WRITE_EXT
+	ATA_CMD_WRITE_EXT,
+	0,
+	0,
+	0,
+	ATA_CMD_WRITE_FUA_EXT
 };
 
 /**
@@ -584,25 +596,32 @@ static const u8 ata_rw_cmds[] = {
  *	LOCKING:
  *	caller.
  */
-void ata_rwcmd_protocol(struct ata_queued_cmd *qc)
+int ata_rwcmd_protocol(struct ata_queued_cmd *qc)
 {
 	struct ata_taskfile *tf = &qc->tf;
 	struct ata_device *dev = qc->dev;
+	u8 cmd;
 
-	int index, lba48, write;
+	int index, fua, lba48, write;
  
+	fua = (tf->flags & ATA_TFLAG_FUA) ? 4 : 0;
 	lba48 = (tf->flags & ATA_TFLAG_LBA48) ? 2 : 0;
 	write = (tf->flags & ATA_TFLAG_WRITE) ? 1 : 0;
 
 	if (dev->flags & ATA_DFLAG_PIO) {
 		tf->protocol = ATA_PROT_PIO;
-		index = dev->multi_count ? 0 : 4;
+		index = dev->multi_count ? 0 : 8;
 	} else {
 		tf->protocol = ATA_PROT_DMA;
-		index = 8;
+		index = 16;
 	}
 
-	tf->command = ata_rw_cmds[index + lba48 + write];
+	cmd = ata_rw_cmds[index + fua + lba48 + write];
+	if (cmd) {
+		tf->command = cmd;
+		return 0;
+	}
+	return -1;
 }
 
 static const char * const xfer_mode_str[] = {
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index e0439be4b57..2c644cbb6e9 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -1080,11 +1080,13 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc, const u8 *scsicm
 	    scsicmd[0] == WRITE_16)
 		tf->flags |= ATA_TFLAG_WRITE;
 
-	/* Calculate the SCSI LBA and transfer length. */
+	/* Calculate the SCSI LBA, transfer length and FUA. */
 	switch (scsicmd[0]) {
 	case READ_10:
 	case WRITE_10:
 		scsi_10_lba_len(scsicmd, &block, &n_block);
+		if (unlikely(scsicmd[1] & (1 << 3)))
+			tf->flags |= ATA_TFLAG_FUA;
 		break;
 	case READ_6:
 	case WRITE_6:
@@ -1099,6 +1101,8 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc, const u8 *scsicm
 	case READ_16:
 	case WRITE_16:
 		scsi_16_lba_len(scsicmd, &block, &n_block);
+		if (unlikely(scsicmd[1] & (1 << 3)))
+			tf->flags |= ATA_TFLAG_FUA;
 		break;
 	default:
 		DPRINTK("no-byte command\n");
@@ -1142,7 +1146,8 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc, const u8 *scsicm
 			tf->device |= (block >> 24) & 0xf;
 		}
 
-		ata_rwcmd_protocol(qc);
+		if (unlikely(ata_rwcmd_protocol(qc) < 0))
+			goto invalid_fld;
 
 		qc->nsect = n_block;
 		tf->nsect = n_block & 0xff;
@@ -1160,7 +1165,8 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc, const u8 *scsicm
 		if ((block >> 28) || (n_block > 256))
 			goto out_of_range;
 
-		ata_rwcmd_protocol(qc);
+		if (unlikely(ata_rwcmd_protocol(qc) < 0))
+			goto invalid_fld;
 
 		/* Convert LBA to CHS */
 		track = (u32)block / dev->sectors;
@@ -1695,6 +1701,7 @@ static unsigned int ata_msense_rw_recovery(u8 **ptr_io, const u8 *last)
 unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf,
 				  unsigned int buflen)
 {
+	struct ata_device *dev = args->dev;
 	u8 *scsicmd = args->cmd->cmnd, *p, *last;
 	const u8 sat_blk_desc[] = {
 		0, 0, 0, 0,	/* number of blocks: sat unspecified */
@@ -1703,6 +1710,7 @@ unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf,
 	};
 	u8 pg, spg;
 	unsigned int ebd, page_control, six_byte, output_len, alloc_len, minlen;
+	u8 dpofua;
 
 	VPRINTK("ENTER\n");
 
@@ -1771,9 +1779,17 @@ unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf,
 
 	if (minlen < 1)
 		return 0;
+
+	dpofua = 0;
+	if (ata_id_has_fua(args->id) && dev->flags & ATA_DFLAG_LBA48 &&
+	    (!(dev->flags & ATA_DFLAG_PIO) || dev->multi_count))
+		dpofua = 1 << 4;
+
 	if (six_byte) {
 		output_len--;
 		rbuf[0] = output_len;
+		if (minlen > 2)
+			rbuf[2] |= dpofua;
 		if (ebd) {
 			if (minlen > 3)
 				rbuf[3] = sizeof(sat_blk_desc);
@@ -1786,6 +1802,8 @@ unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf,
 		rbuf[0] = output_len >> 8;
 		if (minlen > 1)
 			rbuf[1] = output_len;
+		if (minlen > 3)
+			rbuf[3] |= dpofua;
 		if (ebd) {
 			if (minlen > 7)
 				rbuf[7] = sizeof(sat_blk_desc);
@@ -2446,7 +2464,7 @@ int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *))
 		if (xlat_func)
 			ata_scsi_translate(ap, dev, cmd, done, xlat_func);
 		else
-			ata_scsi_simulate(dev->id, cmd, done);
+			ata_scsi_simulate(ap, dev, cmd, done);
 	} else
 		ata_scsi_translate(ap, dev, cmd, done, atapi_xlat);
 
@@ -2469,14 +2487,16 @@ out_unlock:
  *	spin_lock_irqsave(host_set lock)
  */
 
-void ata_scsi_simulate(u16 *id,
+void ata_scsi_simulate(struct ata_port *ap, struct ata_device *dev,
 		      struct scsi_cmnd *cmd,
 		      void (*done)(struct scsi_cmnd *))
 {
 	struct ata_scsi_args args;
 	const u8 *scsicmd = cmd->cmnd;
 
-	args.id = id;
+	args.ap = ap;
+	args.dev = dev;
+	args.id = dev->id;
 	args.cmd = cmd;
 	args.done = done;
 
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index 251e53bdc6e..e03ce48b7b4 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -32,6 +32,8 @@
 #define DRV_VERSION	"1.20"	/* must be exactly four chars */
 
 struct ata_scsi_args {
+	struct ata_port		*ap;
+	struct ata_device	*dev;
 	u16			*id;
 	struct scsi_cmnd	*cmd;
 	void			(*done)(struct scsi_cmnd *);
@@ -41,7 +43,7 @@ struct ata_scsi_args {
 extern int atapi_enabled;
 extern struct ata_queued_cmd *ata_qc_new_init(struct ata_port *ap,
 				      struct ata_device *dev);
-extern void ata_rwcmd_protocol(struct ata_queued_cmd *qc);
+extern int ata_rwcmd_protocol(struct ata_queued_cmd *qc);
 extern void ata_qc_free(struct ata_queued_cmd *qc);
 extern int ata_qc_issue(struct ata_queued_cmd *qc);
 extern int ata_check_atapi_dma(struct ata_queued_cmd *qc);
diff --git a/include/linux/ata.h b/include/linux/ata.h
index d2873b732bb..f63dad4165b 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -129,6 +129,7 @@ enum {
 	ATA_CMD_READ_EXT	= 0x25,
 	ATA_CMD_WRITE		= 0xCA,
 	ATA_CMD_WRITE_EXT	= 0x35,
+	ATA_CMD_WRITE_FUA_EXT	= 0x3D,
 	ATA_CMD_PIO_READ	= 0x20,
 	ATA_CMD_PIO_READ_EXT	= 0x24,
 	ATA_CMD_PIO_WRITE	= 0x30,
@@ -137,6 +138,7 @@ enum {
 	ATA_CMD_READ_MULTI_EXT	= 0x29,
 	ATA_CMD_WRITE_MULTI	= 0xC5,
 	ATA_CMD_WRITE_MULTI_EXT	= 0x39,
+	ATA_CMD_WRITE_MULTI_FUA_EXT = 0xCE,
 	ATA_CMD_SET_FEATURES	= 0xEF,
 	ATA_CMD_PACKET		= 0xA0,
 	ATA_CMD_VERIFY		= 0x40,
@@ -192,6 +194,7 @@ enum {
 	ATA_TFLAG_DEVICE	= (1 << 2), /* enable r/w to device reg */
 	ATA_TFLAG_WRITE		= (1 << 3), /* data dir: host->dev==1 (write) */
 	ATA_TFLAG_LBA		= (1 << 4), /* enable LBA */
+	ATA_TFLAG_FUA		= (1 << 5), /* enable FUA */
 };
 
 enum ata_tf_protocols {
@@ -245,7 +248,8 @@ struct ata_taskfile {
 #define ata_id_is_sata(id)	((id)[93] == 0)
 #define ata_id_rahead_enabled(id) ((id)[85] & (1 << 6))
 #define ata_id_wcache_enabled(id) ((id)[85] & (1 << 5))
-#define ata_id_has_flush(id) ((id)[83] & (1 << 12))
+#define ata_id_has_fua(id)	((id)[84] & (1 << 6))
+#define ata_id_has_flush(id)	((id)[83] & (1 << 12))
 #define ata_id_has_flush_ext(id) ((id)[83] & (1 << 13))
 #define ata_id_has_lba48(id)	((id)[83] & (1 << 10))
 #define ata_id_has_wcache(id)	((id)[82] & (1 << 5))
diff --git a/include/linux/libata.h b/include/linux/libata.h
index e828e172ccb..6db2c084573 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -480,7 +480,8 @@ extern u8   ata_bmdma_status(struct ata_port *ap);
 extern void ata_bmdma_irq_clear(struct ata_port *ap);
 extern void ata_qc_complete(struct ata_queued_cmd *qc);
 extern void ata_eng_timeout(struct ata_port *ap);
-extern void ata_scsi_simulate(u16 *id, struct scsi_cmnd *cmd,
+extern void ata_scsi_simulate(struct ata_port *ap, struct ata_device *dev,
+			      struct scsi_cmnd *cmd,
 			      void (*done)(struct scsi_cmnd *));
 extern int ata_std_bios_param(struct scsi_device *sdev,
 			      struct block_device *bdev,
-- 
cgit v1.2.3


From 3e087b575496b8aa445192f58e7d996b1cdfa121 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:57:31 +0100
Subject: [BLOCK] update IDE to use new blk_ordered for barriers

Update IDE to use new blk_ordered.  This change makes the
following behavior changes.

* Partial completion of the barrier request is handled as
  failure of the whole ordered sequence.  No more partial
  completion for barrier requests.

* Any failure of pre or post flush request results in failure
  of the whole ordered sequence.

So, successfully completed ordered sequence guarantees that
all requests prior to the barrier made to physical medium and,
then, the while barrier request made to the physical medium.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 drivers/ide/ide-disk.c | 137 +++++++++++++++++++------------------------------
 drivers/ide/ide-io.c   |   5 +-
 2 files changed, 54 insertions(+), 88 deletions(-)

diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 4e5767968d7..4b441720b6b 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -681,50 +681,9 @@ static ide_proc_entry_t idedisk_proc[] = {
 
 #endif	/* CONFIG_PROC_FS */
 
-static void idedisk_end_flush(request_queue_t *q, struct request *flush_rq)
+static void idedisk_prepare_flush(request_queue_t *q, struct request *rq)
 {
 	ide_drive_t *drive = q->queuedata;
-	struct request *rq = flush_rq->end_io_data;
-	int good_sectors = rq->hard_nr_sectors;
-	int bad_sectors;
-	sector_t sector;
-
-	if (flush_rq->errors & ABRT_ERR) {
-		printk(KERN_ERR "%s: barrier support doesn't work\n", drive->name);
-		blk_queue_ordered(drive->queue, QUEUE_ORDERED_NONE);
-		blk_queue_issue_flush_fn(drive->queue, NULL);
-		good_sectors = 0;
-	} else if (flush_rq->errors) {
-		good_sectors = 0;
-		if (blk_barrier_preflush(rq)) {
-			sector = ide_get_error_location(drive,flush_rq->buffer);
-			if ((sector >= rq->hard_sector) &&
-			    (sector < rq->hard_sector + rq->hard_nr_sectors))
-				good_sectors = sector - rq->hard_sector;
-		}
-	}
-
-	if (flush_rq->errors)
-		printk(KERN_ERR "%s: failed barrier write: "
-				"sector=%Lx(good=%d/bad=%d)\n",
-				drive->name, (unsigned long long)rq->sector,
-				good_sectors,
-				(int) (rq->hard_nr_sectors-good_sectors));
-
-	bad_sectors = rq->hard_nr_sectors - good_sectors;
-
-	if (good_sectors)
-		__ide_end_request(drive, rq, 1, good_sectors);
-	if (bad_sectors)
-		__ide_end_request(drive, rq, 0, bad_sectors);
-}
-
-static int idedisk_prepare_flush(request_queue_t *q, struct request *rq)
-{
-	ide_drive_t *drive = q->queuedata;
-
-	if (!drive->wcache)
-		return 0;
 
 	memset(rq->cmd, 0, sizeof(rq->cmd));
 
@@ -735,9 +694,8 @@ static int idedisk_prepare_flush(request_queue_t *q, struct request *rq)
 		rq->cmd[0] = WIN_FLUSH_CACHE;
 
 
-	rq->flags |= REQ_DRIVE_TASK | REQ_SOFTBARRIER;
+	rq->flags |= REQ_DRIVE_TASK;
 	rq->buffer = rq->cmd;
-	return 1;
 }
 
 static int idedisk_issue_flush(request_queue_t *q, struct gendisk *disk,
@@ -794,27 +752,64 @@ static int set_nowerr(ide_drive_t *drive, int arg)
 	return 0;
 }
 
+static void update_ordered(ide_drive_t *drive)
+{
+	struct hd_driveid *id = drive->id;
+	unsigned ordered = QUEUE_ORDERED_NONE;
+	prepare_flush_fn *prep_fn = NULL;
+	issue_flush_fn *issue_fn = NULL;
+
+	if (drive->wcache) {
+		unsigned long long capacity;
+		int barrier;
+		/*
+		 * We must avoid issuing commands a drive does not
+		 * understand or we may crash it. We check flush cache
+		 * is supported. We also check we have the LBA48 flush
+		 * cache if the drive capacity is too large. By this
+		 * time we have trimmed the drive capacity if LBA48 is
+		 * not available so we don't need to recheck that.
+		 */
+		capacity = idedisk_capacity(drive);
+		barrier = ide_id_has_flush_cache(id) &&
+			(drive->addressing == 0 || capacity <= (1ULL << 28) ||
+			 ide_id_has_flush_cache_ext(id));
+
+		printk(KERN_INFO "%s: cache flushes %ssupported\n",
+		       drive->name, barrier ? "" : "not");
+
+		if (barrier) {
+			ordered = QUEUE_ORDERED_DRAIN_FLUSH;
+			prep_fn = idedisk_prepare_flush;
+			issue_fn = idedisk_issue_flush;
+		}
+	} else
+		ordered = QUEUE_ORDERED_DRAIN;
+
+	blk_queue_ordered(drive->queue, ordered, prep_fn);
+	blk_queue_issue_flush_fn(drive->queue, issue_fn);
+}
+
 static int write_cache(ide_drive_t *drive, int arg)
 {
 	ide_task_t args;
-	int err;
-
-	if (!ide_id_has_flush_cache(drive->id))
-		return 1;
+	int err = 1;
 
-	memset(&args, 0, sizeof(ide_task_t));
-	args.tfRegister[IDE_FEATURE_OFFSET]	= (arg) ?
+	if (ide_id_has_flush_cache(drive->id)) {
+		memset(&args, 0, sizeof(ide_task_t));
+		args.tfRegister[IDE_FEATURE_OFFSET]	= (arg) ?
 			SETFEATURES_EN_WCACHE : SETFEATURES_DIS_WCACHE;
-	args.tfRegister[IDE_COMMAND_OFFSET]	= WIN_SETFEATURES;
-	args.command_type			= IDE_DRIVE_TASK_NO_DATA;
-	args.handler				= &task_no_data_intr;
+		args.tfRegister[IDE_COMMAND_OFFSET]	= WIN_SETFEATURES;
+		args.command_type		= IDE_DRIVE_TASK_NO_DATA;
+		args.handler			= &task_no_data_intr;
+		err = ide_raw_taskfile(drive, &args, NULL);
+		if (err == 0)
+			drive->wcache = arg;
+	}
 
-	err = ide_raw_taskfile(drive, &args, NULL);
-	if (err)
-		return err;
+	update_ordered(drive);
 
-	drive->wcache = arg;
-	return 0;
+	return err;
 }
 
 static int do_idedisk_flushcache (ide_drive_t *drive)
@@ -888,7 +883,6 @@ static void idedisk_setup (ide_drive_t *drive)
 {
 	struct hd_driveid *id = drive->id;
 	unsigned long long capacity;
-	int barrier;
 
 	idedisk_add_settings(drive);
 
@@ -992,31 +986,6 @@ static void idedisk_setup (ide_drive_t *drive)
 		drive->wcache = 1;
 
 	write_cache(drive, 1);
-
-	/*
-	 * We must avoid issuing commands a drive does not understand
-	 * or we may crash it. We check flush cache is supported. We also
-	 * check we have the LBA48 flush cache if the drive capacity is
-	 * too large. By this time we have trimmed the drive capacity if
-	 * LBA48 is not available so we don't need to recheck that.
-	 */
-	barrier = 0;
-	if (ide_id_has_flush_cache(id))
-		barrier = 1;
-	if (drive->addressing == 1) {
-		/* Can't issue the correct flush ? */
-		if (capacity > (1ULL << 28) && !ide_id_has_flush_cache_ext(id))
-			barrier = 0;
-	}
-
-	printk(KERN_INFO "%s: cache flushes %ssupported\n",
-		drive->name, barrier ? "" : "not ");
-	if (barrier) {
-		blk_queue_ordered(drive->queue, QUEUE_ORDERED_FLUSH);
-		drive->queue->prepare_flush_fn = idedisk_prepare_flush;
-		drive->queue->end_flush_fn = idedisk_end_flush;
-		blk_queue_issue_flush_fn(drive->queue, idedisk_issue_flush);
-	}
 }
 
 static void ide_cacheflush_p(ide_drive_t *drive)
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 8435b44a700..b5dc6df8e67 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -119,10 +119,7 @@ int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors)
 	if (!nr_sectors)
 		nr_sectors = rq->hard_cur_sectors;
 
-	if (blk_complete_barrier_rq_locked(drive->queue, rq, nr_sectors))
-		ret = rq->nr_sectors != 0;
-	else
-		ret = __ide_end_request(drive, rq, uptodate, nr_sectors);
+	ret = __ide_end_request(drive, rq, uptodate, nr_sectors);
 
 	spin_unlock_irqrestore(&ide_lock, flags);
 	return ret;
-- 
cgit v1.2.3


From ff5b8cf1491330836d75eede4e5632caa32b776a Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:58:37 +0100
Subject: [BLOCK] I/O barrier documentation update

Update documentation to match new barrier implementation.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 Documentation/block/biodoc.txt | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 303c57a7fad..8e63831971d 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -263,14 +263,8 @@ A flag in the bio structure, BIO_BARRIER is used to identify a barrier i/o.
 The generic i/o scheduler would make sure that it places the barrier request and
 all other requests coming after it after all the previous requests in the
 queue. Barriers may be implemented in different ways depending on the
-driver. A SCSI driver for example could make use of ordered tags to
-preserve the necessary ordering with a lower impact on throughput. For IDE
-this might be two sync cache flush: a pre and post flush when encountering
-a barrier write.
-
-There is a provision for queues to indicate what kind of barriers they
-can provide. This is as of yet unmerged, details will be added here once it
-is in the kernel.
+driver. For more details regarding I/O barriers, please read barrier.txt
+in this directory.
 
 1.2.2 Request Priority/Latency
 
-- 
cgit v1.2.3


From 15fc858a0067c800f410a24551a7b461978abf0b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Fri, 6 Jan 2006 10:00:50 +0100
Subject: [BLOCK] Correct blk_execute_rq_nowait() prototype

---
 include/linux/blkdev.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 15db0f112d0..fb098537742 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -613,8 +613,7 @@ extern int blk_rq_map_user_iov(request_queue_t *, struct request *, struct sg_io
 extern int blk_execute_rq(request_queue_t *, struct gendisk *,
 			  struct request *, int);
 extern void blk_execute_rq_nowait(request_queue_t *, struct gendisk *,
-				  struct request *, int,
-				  void (*done)(struct request *));
+				  struct request *, int, rq_end_io_fn *);
 
 static inline request_queue_t *bdev_get_queue(struct block_device *bdev)
 {
-- 
cgit v1.2.3


From e650c305ec3178818b317dad37a6d9c7fa8ba28d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Fri, 6 Jan 2006 12:38:30 +0100
Subject: [SCSI] scsi_end_async() needs to take an uptodate parameter

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 drivers/scsi/scsi_lib.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 7a38b101976..ba93d6e66d4 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -308,7 +308,7 @@ struct scsi_io_context {
 
 static kmem_cache_t *scsi_io_context_cache;
 
-static void scsi_end_async(struct request *req)
+static void scsi_end_async(struct request *req, int uptodate)
 {
 	struct scsi_io_context *sioc = req->end_io_data;
 
-- 
cgit v1.2.3