Diffstat (limited to 'drivers')
-rw-r--r-- drivers/Kconfig | 2
-rw-r--r-- drivers/Makefile | 1
-rw-r--r-- drivers/ata/libata-core.c | 10
-rw-r--r-- drivers/ata/libata-scsi.c | 2
-rw-r--r-- drivers/base/memory.c | 9
-rw-r--r-- drivers/block/DAC960.c | 3
-rw-r--r-- drivers/block/Kconfig | 6
-rw-r--r-- drivers/block/Makefile | 2
-rw-r--r-- drivers/block/cciss.c | 4
-rw-r--r-- drivers/block/cpqarray.c | 4
-rw-r--r-- drivers/block/cryptoloop.c | 12
-rw-r--r-- drivers/block/lguest_blk.c | 421
-rw-r--r-- drivers/block/sunvdc.c | 1
-rw-r--r-- drivers/block/sx8.c | 2
-rw-r--r-- drivers/block/ub.c | 11
-rw-r--r-- drivers/block/viodasd.c | 2
-rw-r--r-- drivers/block/virtio_blk.c | 308
-rw-r--r-- drivers/bluetooth/Kconfig | 35
-rw-r--r-- drivers/bluetooth/Makefile | 4
-rw-r--r-- drivers/bluetooth/bluecard_cs.c | 5
-rw-r--r-- drivers/bluetooth/bpa10x.c | 624
-rw-r--r-- drivers/bluetooth/bt3c_cs.c | 5
-rw-r--r-- drivers/bluetooth/btsdio.c | 406
-rw-r--r-- drivers/bluetooth/btuart_cs.c | 5
-rw-r--r-- drivers/bluetooth/btusb.c | 564
-rw-r--r-- drivers/bluetooth/dtl1_cs.c | 5
-rw-r--r-- drivers/bluetooth/hci_bcsp.c | 3
-rw-r--r-- drivers/bluetooth/hci_ldisc.c | 8
-rw-r--r-- drivers/bluetooth/hci_ll.c | 531
-rw-r--r-- drivers/bluetooth/hci_uart.h | 8
-rw-r--r-- drivers/char/Kconfig | 4
-rw-r--r-- drivers/char/Makefile | 2
-rw-r--r-- drivers/char/cyclades.c | 2
-rw-r--r-- drivers/char/hvc_lguest.c | 177
-rw-r--r-- drivers/char/virtio_console.c | 225
-rw-r--r-- drivers/firewire/fw-ohci.c | 13
-rw-r--r-- drivers/ide/cris/ide-cris.c | 4
-rw-r--r-- drivers/ide/ide-probe.c | 5
-rw-r--r-- drivers/ide/ide-taskfile.c | 2
-rw-r--r-- drivers/ide/mips/au1xxx-ide.c | 6
-rw-r--r-- drivers/ieee1394/dma.c | 4
-rw-r--r-- drivers/ieee1394/sbp2.c | 2
-rw-r--r-- drivers/infiniband/core/cma.c | 160
-rw-r--r-- drivers/infiniband/core/umem.c | 11
-rw-r--r-- drivers/infiniband/core/uverbs_cmd.c | 8
-rw-r--r-- drivers/infiniband/hw/ehca/ehca_classes.h | 1
-rw-r--r-- drivers/infiniband/hw/ehca/ehca_hca.c | 1
-rw-r--r-- drivers/infiniband/hw/ehca/ehca_main.c | 20
-rw-r--r-- drivers/infiniband/hw/ehca/ehca_mrmw.c | 63
-rw-r--r-- drivers/infiniband/hw/ehca/ehca_qp.c | 4
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_dma.c | 4
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_mr.c | 2
-rw-r--r-- drivers/infiniband/hw/mlx4/qp.c | 16
-rw-r--r-- drivers/infiniband/hw/mthca/mthca_cq.c | 53
-rw-r--r-- drivers/infiniband/hw/mthca/mthca_doorbell.h | 13
-rw-r--r-- drivers/infiniband/hw/mthca/mthca_eq.c | 21
-rw-r--r-- drivers/infiniband/hw/mthca/mthca_memfree.c | 24
-rw-r--r-- drivers/infiniband/hw/mthca/mthca_qp.c | 45
-rw-r--r-- drivers/infiniband/hw/mthca/mthca_srq.c | 11
-rw-r--r-- drivers/infiniband/ulp/ipoib/ipoib.h | 15
-rw-r--r-- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 114
-rw-r--r-- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 52
-rw-r--r-- drivers/infiniband/ulp/ipoib/ipoib_main.c | 4
-rw-r--r-- drivers/infiniband/ulp/iser/iser_memory.c | 8
-rw-r--r-- drivers/input/keyboard/bf54x-keys.c | 1
-rw-r--r-- drivers/input/mouse/appletouch.c | 25
-rw-r--r-- drivers/input/serio/i8042.c | 4
-rw-r--r-- drivers/input/serio/i8042.h | 22
-rw-r--r-- drivers/input/touchscreen/Kconfig | 6
-rw-r--r-- drivers/input/touchscreen/usbtouchscreen.c | 36
-rw-r--r-- drivers/kvm/Kconfig | 4
-rw-r--r-- drivers/kvm/kvm_main.c | 37
-rw-r--r-- drivers/kvm/lapic.c | 38
-rw-r--r-- drivers/kvm/mmu.c | 3
-rw-r--r-- drivers/kvm/vmx.c | 16
-rw-r--r-- drivers/kvm/x86_emulate.c | 77
-rw-r--r-- drivers/lguest/Kconfig | 13
-rw-r--r-- drivers/lguest/Makefile | 10
-rw-r--r-- drivers/lguest/core.c | 568
-rw-r--r-- drivers/lguest/hypercalls.c | 177
-rw-r--r-- drivers/lguest/interrupts_and_traps.c | 125
-rw-r--r-- drivers/lguest/io.c | 626
-rw-r--r-- drivers/lguest/lg.h | 189
-rw-r--r-- drivers/lguest/lguest.c | 1108
-rw-r--r-- drivers/lguest/lguest_asm.S | 93
-rw-r--r-- drivers/lguest/lguest_bus.c | 218
-rw-r--r-- drivers/lguest/lguest_device.c | 373
-rw-r--r-- drivers/lguest/lguest_user.c | 138
-rw-r--r-- drivers/lguest/page_tables.c | 250
-rw-r--r-- drivers/lguest/segments.c | 28
-rw-r--r-- drivers/lguest/x86/core.c | 577
-rw-r--r-- drivers/lguest/x86/switcher_32.S (renamed from drivers/lguest/switcher.S) | 7
-rw-r--r-- drivers/md/bitmap.c | 2
-rw-r--r-- drivers/md/dm-crypt.c | 21
-rw-r--r-- drivers/md/raid5.c | 17
-rw-r--r-- drivers/media/common/ir-keymaps.c | 70
-rw-r--r-- drivers/media/common/saa7146_core.c | 3
-rw-r--r-- drivers/media/dvb/cinergyT2/cinergyT2.c | 42
-rw-r--r-- drivers/media/dvb/dvb-core/dvb_ca_en50221.c | 2
-rw-r--r-- drivers/media/dvb/dvb-usb/dib0700_devices.c | 2
-rw-r--r-- drivers/media/radio/miropcm20-radio.c | 1
-rw-r--r-- drivers/media/radio/radio-gemtek.c | 1
-rw-r--r-- drivers/media/video/arv.c | 1
-rw-r--r-- drivers/media/video/bt8xx/bttv-driver.c | 3
-rw-r--r-- drivers/media/video/bw-qcam.c | 1
-rw-r--r-- drivers/media/video/c-qcam.c | 1
-rw-r--r-- drivers/media/video/cpia.c | 5
-rw-r--r-- drivers/media/video/cpia2/cpia2_v4l.c | 5
-rw-r--r-- drivers/media/video/cx23885/cx23885-core.c | 6
-rw-r--r-- drivers/media/video/cx88/cx88-alsa.c | 86
-rw-r--r-- drivers/media/video/cx88/cx88-blackbird.c | 57
-rw-r--r-- drivers/media/video/cx88/cx88-dvb.c | 3
-rw-r--r-- drivers/media/video/cx88/cx88-mpeg.c | 133
-rw-r--r-- drivers/media/video/cx88/cx88-video.c | 1
-rw-r--r-- drivers/media/video/cx88/cx88-vp3054-i2c.c | 16
-rw-r--r-- drivers/media/video/cx88/cx88.h | 24
-rw-r--r-- drivers/media/video/em28xx/em28xx-core.c | 3
-rw-r--r-- drivers/media/video/em28xx/em28xx-video.c | 2
-rw-r--r-- drivers/media/video/et61x251/et61x251_core.c | 1
-rw-r--r-- drivers/media/video/ir-kbd-i2c.c | 1
-rw-r--r-- drivers/media/video/ivtv/ivtv-driver.c | 11
-rw-r--r-- drivers/media/video/ivtv/ivtv-driver.h | 1
-rw-r--r-- drivers/media/video/ivtv/ivtv-fileops.c | 8
-rw-r--r-- drivers/media/video/ivtv/ivtv-ioctl.c | 13
-rw-r--r-- drivers/media/video/ivtv/ivtv-streams.c | 116
-rw-r--r-- drivers/media/video/ivtv/ivtv-streams.h | 1
-rw-r--r-- drivers/media/video/ivtv/ivtv-udma.c | 4
-rw-r--r-- drivers/media/video/ivtv/ivtv-yuv.c | 160
-rw-r--r-- drivers/media/video/ivtv/ivtv-yuv.h | 1
-rw-r--r-- drivers/media/video/ivtv/ivtvfb.c | 92
-rw-r--r-- drivers/media/video/meye.c | 1
-rw-r--r-- drivers/media/video/ov511.c | 1
-rw-r--r-- drivers/media/video/planb.c | 1
-rw-r--r-- drivers/media/video/pms.c | 1
-rw-r--r-- drivers/media/video/pvrusb2/pvrusb2-encoder.c | 6
-rw-r--r-- drivers/media/video/pvrusb2/pvrusb2-hdw-internal.h | 11
-rw-r--r-- drivers/media/video/pvrusb2/pvrusb2-hdw.c | 3
-rw-r--r-- drivers/media/video/pvrusb2/pvrusb2-v4l2.c | 3
-rw-r--r-- drivers/media/video/pwc/pwc-if.c | 1
-rw-r--r-- drivers/media/video/saa7134/saa6752hs.c | 111
-rw-r--r-- drivers/media/video/saa7134/saa7134-core.c | 44
-rw-r--r-- drivers/media/video/saa7134/saa7134-empress.c | 12
-rw-r--r-- drivers/media/video/saa7134/saa7134-input.c | 29
-rw-r--r-- drivers/media/video/saa7134/saa7134-tvaudio.c | 32
-rw-r--r-- drivers/media/video/saa7134/saa7134-video.c | 28
-rw-r--r-- drivers/media/video/saa7134/saa7134.h | 7
-rw-r--r-- drivers/media/video/se401.c | 1
-rw-r--r-- drivers/media/video/sn9c102/sn9c102_core.c | 1
-rw-r--r-- drivers/media/video/stradis.c | 1
-rw-r--r-- drivers/media/video/stv680.c | 1
-rw-r--r-- drivers/media/video/tuner-core.c | 2
-rw-r--r-- drivers/media/video/usbvideo/usbvideo.c | 1
-rw-r--r-- drivers/media/video/usbvideo/vicam.c | 1
-rw-r--r-- drivers/media/video/usbvision/usbvision-video.c | 3
-rw-r--r-- drivers/media/video/v4l2-common.c | 2
-rw-r--r-- drivers/media/video/videobuf-core.c | 2
-rw-r--r-- drivers/media/video/videobuf-dma-sg.c | 9
-rw-r--r-- drivers/media/video/videocodec.c | 4
-rw-r--r-- drivers/media/video/videodev.c | 42
-rw-r--r-- drivers/media/video/vivi.c | 1
-rw-r--r-- drivers/media/video/w9966.c | 1
-rw-r--r-- drivers/media/video/w9968cf.c | 1
-rw-r--r-- drivers/media/video/zc0301/zc0301_core.c | 1
-rw-r--r-- drivers/media/video/zoran_card.c | 10
-rw-r--r-- drivers/media/video/zoran_driver.c | 2
-rw-r--r-- drivers/mmc/card/queue.c | 15
-rw-r--r-- drivers/mmc/host/at91_mci.c | 8
-rw-r--r-- drivers/mmc/host/au1xmmc.c | 11
-rw-r--r-- drivers/mmc/host/imxmmc.c | 2
-rw-r--r-- drivers/mmc/host/mmc_spi.c | 8
-rw-r--r-- drivers/mmc/host/mmci.h | 2
-rw-r--r-- drivers/mmc/host/omap.c | 4
-rw-r--r-- drivers/mmc/host/sdhci.c | 3
-rw-r--r-- drivers/mmc/host/tifm_sd.c | 8
-rw-r--r-- drivers/mmc/host/wbsd.c | 6
-rw-r--r-- drivers/mtd/chips/cfi_cmdset_0001.c | 146
-rw-r--r-- drivers/mtd/nand/Kconfig | 2
-rw-r--r-- drivers/mtd/nand/diskonchip.c | 4
-rw-r--r-- drivers/mtd/nand/nand_base.c | 6
-rw-r--r-- drivers/mtd/nand/nand_ecc.c | 2
-rw-r--r-- drivers/mtd/nand/nandsim.c | 2
-rw-r--r-- drivers/mtd/nand/s3c2410.c | 14
-rw-r--r-- drivers/mtd/onenand/onenand_sim.c | 50
-rw-r--r-- drivers/net/Kconfig | 6
-rw-r--r-- drivers/net/Makefile | 2
-rw-r--r-- drivers/net/cpmac.c | 2
-rw-r--r-- drivers/net/fec.c | 24
-rw-r--r-- drivers/net/lguest_net.c | 555
-rw-r--r-- drivers/net/mlx4/fw.c | 2
-rw-r--r-- drivers/net/mlx4/icm.c | 14
-rw-r--r-- drivers/net/mv643xx_eth.c | 1
-rw-r--r-- drivers/net/niu.c | 34
-rw-r--r-- drivers/net/ppp_mppe.c | 6
-rw-r--r-- drivers/net/r8169.c | 406
-rw-r--r-- drivers/net/tg3.c | 95
-rw-r--r-- drivers/net/tg3.h | 11
-rw-r--r-- drivers/net/virtio_net.c | 435
-rw-r--r-- drivers/parisc/ccio-dma.c | 1
-rw-r--r-- drivers/parisc/lba_pci.c | 51
-rw-r--r-- drivers/parisc/pdc_stable.c | 11
-rw-r--r-- drivers/parisc/sba_iommu.c | 5
-rw-r--r-- drivers/parisc/superio.c | 4
-rw-r--r-- drivers/pci/Makefile | 3
-rw-r--r-- drivers/pci/dmar.c | 329
-rw-r--r-- drivers/pci/intel-iommu.c | 2271
-rw-r--r-- drivers/pci/intel-iommu.h | 325
-rw-r--r-- drivers/pci/iova.c | 394
-rw-r--r-- drivers/pci/iova.h | 63
-rw-r--r-- drivers/pci/pci.h | 1
-rw-r--r-- drivers/pci/probe.c | 14
-rw-r--r-- drivers/pci/search.c | 34
-rw-r--r-- drivers/power/apm_power.c | 141
-rw-r--r-- drivers/s390/char/raw3270.c | 26
-rw-r--r-- drivers/s390/char/tape_class.c | 19
-rw-r--r-- drivers/s390/char/tape_class.h | 4
-rw-r--r-- drivers/s390/char/vmlogrdr.c | 15
-rw-r--r-- drivers/s390/cio/chp.c | 12
-rw-r--r-- drivers/s390/cio/css.c | 9
-rw-r--r-- drivers/s390/scsi/zfcp_aux.c | 1
-rw-r--r-- drivers/s390/scsi/zfcp_def.h | 4
-rw-r--r-- drivers/s390/scsi/zfcp_erp.c | 10
-rw-r--r-- drivers/sbus/char/vfc_dev.c | 2
-rw-r--r-- drivers/scsi/3w-9xxx.c | 4
-rw-r--r-- drivers/scsi/3w-xxxx.c | 2
-rw-r--r-- drivers/scsi/NCR5380.c | 6
-rw-r--r-- drivers/scsi/NCR53C9x.c | 4
-rw-r--r-- drivers/scsi/NCR53c406a.c | 6
-rw-r--r-- drivers/scsi/aacraid/aachba.c | 2
-rw-r--r-- drivers/scsi/aha152x.c | 2
-rw-r--r-- drivers/scsi/aha1542.c | 8
-rw-r--r-- drivers/scsi/arcmsr/arcmsr_hba.c | 4
-rw-r--r-- drivers/scsi/atari_NCR5380.c | 6
-rw-r--r-- drivers/scsi/eata_pio.c | 4
-rw-r--r-- drivers/scsi/fd_mcs.c | 6
-rw-r--r-- drivers/scsi/fdomain.c | 7
-rw-r--r-- drivers/scsi/gdth.c | 6
-rw-r--r-- drivers/scsi/ibmmca.c | 2
-rw-r--r-- drivers/scsi/ide-scsi.c | 12
-rw-r--r-- drivers/scsi/imm.c | 8
-rw-r--r-- drivers/scsi/in2000.c | 4
-rw-r--r-- drivers/scsi/ipr.c | 19
-rw-r--r-- drivers/scsi/ips.c | 6
-rw-r--r-- drivers/scsi/iscsi_tcp.c | 15
-rw-r--r-- drivers/scsi/megaraid.c | 8
-rw-r--r-- drivers/scsi/megaraid/megaraid_mbox.c | 12
-rw-r--r-- drivers/scsi/oktagon_esp.c | 6
-rw-r--r-- drivers/scsi/osst.c | 32
-rw-r--r-- drivers/scsi/pcmcia/nsp_cs.h | 2
-rw-r--r-- drivers/scsi/pcmcia/sym53c500_cs.c | 6
-rw-r--r-- drivers/scsi/ppa.c | 7
-rw-r--r-- drivers/scsi/ps3rom.c | 6
-rw-r--r-- drivers/scsi/qlogicfas408.c | 2
-rw-r--r-- drivers/scsi/scsi_debug.c | 4
-rw-r--r-- drivers/scsi/scsi_lib.c | 13
-rw-r--r-- drivers/scsi/seagate.c | 8
-rw-r--r-- drivers/scsi/sg.c | 30
-rw-r--r-- drivers/scsi/st.c | 8
-rw-r--r-- drivers/scsi/sun3_NCR5380.c | 3
-rw-r--r-- drivers/scsi/sym53c416.c | 2
-rw-r--r-- drivers/scsi/tmscsim.c | 5
-rw-r--r-- drivers/scsi/ultrastor.c | 2
-rw-r--r-- drivers/scsi/wd33c93.c | 6
-rw-r--r-- drivers/scsi/wd7000.c | 2
-rw-r--r-- drivers/serial/Kconfig | 2
-rw-r--r-- drivers/serial/mcf.c | 653
-rw-r--r-- drivers/usb/core/message.c | 8
-rw-r--r-- drivers/usb/image/microtek.c | 5
-rw-r--r-- drivers/usb/misc/usbtest.c | 4
-rw-r--r-- drivers/usb/storage/protocol.c | 2
-rw-r--r-- drivers/virtio/Kconfig | 8
-rw-r--r-- drivers/virtio/Makefile | 2
-rw-r--r-- drivers/virtio/config.c | 13
-rw-r--r-- drivers/virtio/virtio.c | 189
-rw-r--r-- drivers/virtio/virtio_ring.c | 313
-rw-r--r-- drivers/watchdog/mpc5200_wdt.c | 3
275 files changed, 10799 insertions, 6497 deletions
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 34f40ea0ba6..f4076d9e990 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -94,5 +94,5 @@ source "drivers/kvm/Kconfig"
source "drivers/uio/Kconfig"
-source "drivers/lguest/Kconfig"
+source "drivers/virtio/Kconfig"
endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index cfe38ffff28..560496b4330 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -91,3 +91,4 @@ obj-$(CONFIG_HID) += hid/
obj-$(CONFIG_PPC_PS3) += ps3/
obj-$(CONFIG_OF) += of/
obj-$(CONFIG_SSB) += ssb/
+obj-$(CONFIG_VIRTIO) += virtio/
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 629eadbd0ec..69092bce1ad 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -4296,7 +4296,7 @@ void ata_sg_clean(struct ata_queued_cmd *qc)
sg_last(sg, qc->orig_n_elem)->length += qc->pad_len;
if (pad_buf) {
struct scatterlist *psg = &qc->pad_sgent;
- void *addr = kmap_atomic(psg->page, KM_IRQ0);
+ void *addr = kmap_atomic(sg_page(psg), KM_IRQ0);
memcpy(addr + psg->offset, pad_buf, qc->pad_len);
kunmap_atomic(addr, KM_IRQ0);
}
@@ -4686,11 +4686,11 @@ static int ata_sg_setup(struct ata_queued_cmd *qc)
* data in this function or read data in ata_sg_clean.
*/
offset = lsg->offset + lsg->length - qc->pad_len;
- psg->page = nth_page(lsg->page, offset >> PAGE_SHIFT);
+ sg_set_page(psg, nth_page(sg_page(lsg), offset >> PAGE_SHIFT));
psg->offset = offset_in_page(offset);
if (qc->tf.flags & ATA_TFLAG_WRITE) {
- void *addr = kmap_atomic(psg->page, KM_IRQ0);
+ void *addr = kmap_atomic(sg_page(psg), KM_IRQ0);
memcpy(pad_buf, addr + psg->offset, qc->pad_len);
kunmap_atomic(addr, KM_IRQ0);
}
@@ -4836,7 +4836,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
if (qc->curbytes == qc->nbytes - qc->sect_size)
ap->hsm_task_state = HSM_ST_LAST;
- page = qc->cursg->page;
+ page = sg_page(qc->cursg);
offset = qc->cursg->offset + qc->cursg_ofs;
/* get the current page and offset */
@@ -4988,7 +4988,7 @@ next_sg:
sg = qc->cursg;
- page = sg->page;
+ page = sg_page(sg);
offset = sg->offset + qc->cursg_ofs;
/* get the current page and offset */
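The hunks above are typical of the tree-wide conversion this commit performs: with chained scatterlists, the low bits of the scatterlist's page pointer double as chain and end markers, so drivers can no longer dereference sg->page directly. A minimal sketch of the accessor pattern, assuming the two-argument sg_set_page() visible in these hunks; fill_entry() and map_entry() are hypothetical helpers, not code from this commit:

/*
 * sg_set_page() stores the page without disturbing the chain/end
 * marker bits, and sg_page() strips those bits on read.
 */
#include <linux/scatterlist.h>
#include <linux/highmem.h>

static void fill_entry(struct scatterlist *sg, struct page *page,
		       unsigned int offset, unsigned int len)
{
	sg_set_page(sg, page);
	sg->offset = offset;
	sg->length = len;
}

static void *map_entry(struct scatterlist *sg)
{
	/* Same mapping idiom as the libata hunks above. */
	return kmap_atomic(sg_page(sg), KM_IRQ0) + sg->offset;
}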
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 9fbb39cd0f5..5b758b9ad0b 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1544,7 +1544,7 @@ static unsigned int ata_scsi_rbuf_get(struct scsi_cmnd *cmd, u8 **buf_out)
struct scatterlist *sg = scsi_sglist(cmd);
if (sg) {
- buf = kmap_atomic(sg->page, KM_IRQ0) + sg->offset;
+ buf = kmap_atomic(sg_page(sg), KM_IRQ0) + sg->offset;
buflen = sg->length;
} else {
buf = NULL;
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index c41d0728efe..7868707c7ed 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -137,7 +137,7 @@ static ssize_t show_mem_state(struct sys_device *dev, char *buf)
return len;
}
-static inline int memory_notify(unsigned long val, void *v)
+int memory_notify(unsigned long val, void *v)
{
return blocking_notifier_call_chain(&memory_chain, val, v);
}
@@ -183,7 +183,6 @@ memory_block_action(struct memory_block *mem, unsigned long action)
break;
case MEM_OFFLINE:
mem->state = MEM_GOING_OFFLINE;
- memory_notify(MEM_GOING_OFFLINE, NULL);
start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
ret = remove_memory(start_paddr,
PAGES_PER_SECTION << PAGE_SHIFT);
@@ -191,7 +190,6 @@ memory_block_action(struct memory_block *mem, unsigned long action)
mem->state = old_state;
break;
}
- memory_notify(MEM_MAPPING_INVALID, NULL);
break;
default:
printk(KERN_WARNING "%s(%p, %ld) unknown action: %ld\n",
@@ -199,11 +197,6 @@ memory_block_action(struct memory_block *mem, unsigned long action)
WARN_ON(1);
ret = -EINVAL;
}
- /*
- * For now, only notify on successful memory operations
- */
- if (!ret)
- memory_notify(action, NULL);
return ret;
}
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 84d6aa500e2..9030c373ce6 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -44,6 +44,7 @@
#include <linux/init.h>
#include <linux/jiffies.h>
#include <linux/random.h>
+#include <linux/scatterlist.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include "DAC960.h"
@@ -345,6 +346,7 @@ static bool DAC960_CreateAuxiliaryStructures(DAC960_Controller_T *Controller)
Command->V1.ScatterGatherList =
(DAC960_V1_ScatterGatherSegment_T *)ScatterGatherCPU;
Command->V1.ScatterGatherListDMA = ScatterGatherDMA;
+ sg_init_table(Command->cmd_sglist, DAC960_V1_ScatterGatherLimit);
} else {
Command->cmd_sglist = Command->V2.ScatterList;
Command->V2.ScatterGatherList =
@@ -353,6 +355,7 @@ static bool DAC960_CreateAuxiliaryStructures(DAC960_Controller_T *Controller)
Command->V2.RequestSense =
(DAC960_SCSI_RequestSense_T *)RequestSenseCPU;
Command->V2.RequestSenseDMA = RequestSenseDMA;
+ sg_init_table(Command->cmd_sglist, DAC960_V2_ScatterGatherLimit);
}
}
return true;
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index ce4b1e484e6..4d0119ea9e3 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -425,4 +425,10 @@ config XEN_BLKDEV_FRONTEND
block device driver. It communicates with a back-end driver
in another domain which drives the actual block device.
+config VIRTIO_BLK
+ tristate "Virtio block driver (EXPERIMENTAL)"
+ depends on EXPERIMENTAL && VIRTIO
+ ---help---
+ This is the virtual block driver for lguest. Say Y or M.
+
endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 014e72121b5..7691505a2e1 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -25,10 +25,10 @@ obj-$(CONFIG_SUNVDC) += sunvdc.o
obj-$(CONFIG_BLK_DEV_UMEM) += umem.o
obj-$(CONFIG_BLK_DEV_NBD) += nbd.o
obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o
+obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o
obj-$(CONFIG_VIODASD) += viodasd.o
obj-$(CONFIG_BLK_DEV_SX8) += sx8.o
obj-$(CONFIG_BLK_DEV_UB) += ub.o
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
-obj-$(CONFIG_LGUEST_BLOCK) += lguest_blk.o
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 7c2cfde08f1..5a6fe17fc63 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2610,7 +2610,7 @@ static void do_cciss_request(struct request_queue *q)
(int)creq->nr_sectors);
#endif /* CCISS_DEBUG */
- memset(tmp_sg, 0, sizeof(tmp_sg));
+ sg_init_table(tmp_sg, MAXSGENTRIES);
seg = blk_rq_map_sg(q, creq, tmp_sg);
/* get the DMA records for the setup */
@@ -2621,7 +2621,7 @@ static void do_cciss_request(struct request_queue *q)
for (i = 0; i < seg; i++) {
c->SG[i].Len = tmp_sg[i].length;
- temp64.val = (__u64) pci_map_page(h->pdev, tmp_sg[i].page,
+ temp64.val = (__u64) pci_map_page(h->pdev, sg_page(&tmp_sg[i]),
tmp_sg[i].offset,
tmp_sg[i].length, dir);
c->SG[i].Addr.lower = temp64.val32.lower;
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index 568603d3043..c8132d95879 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -37,6 +37,7 @@
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
+#include <linux/scatterlist.h>
#include <asm/uaccess.h>
#include <asm/io.h>
@@ -918,6 +919,7 @@ queue_next:
DBGPX(
printk("sector=%d, nr_sectors=%d\n", creq->sector, creq->nr_sectors);
);
+ sg_init_table(tmp_sg, SG_MAX);
seg = blk_rq_map_sg(q, creq, tmp_sg);
/* Now do all the DMA Mappings */
@@ -929,7 +931,7 @@ DBGPX(
{
c->req.sg[i].size = tmp_sg[i].length;
c->req.sg[i].addr = (__u32) pci_map_page(h->pci_dev,
- tmp_sg[i].page,
+ sg_page(&tmp_sg[i]),
tmp_sg[i].offset,
tmp_sg[i].length, dir);
}
diff --git a/drivers/block/cryptoloop.c b/drivers/block/cryptoloop.c
index 40535036e89..1b58b010797 100644
--- a/drivers/block/cryptoloop.c
+++ b/drivers/block/cryptoloop.c
@@ -26,6 +26,7 @@
#include <linux/crypto.h>
#include <linux/blkdev.h>
#include <linux/loop.h>
+#include <linux/scatterlist.h>
#include <asm/semaphore.h>
#include <asm/uaccess.h>
@@ -119,14 +120,17 @@ cryptoloop_transfer(struct loop_device *lo, int cmd,
.tfm = tfm,
.flags = CRYPTO_TFM_REQ_MAY_SLEEP,
};
- struct scatterlist sg_out = { NULL, };
- struct scatterlist sg_in = { NULL, };
+ struct scatterlist sg_out;
+ struct scatterlist sg_in;
encdec_cbc_t encdecfunc;
struct page *in_page, *out_page;
unsigned in_offs, out_offs;
int err;
+ sg_init_table(&sg_out, 1);
+ sg_init_table(&sg_in, 1);
+
if (cmd == READ) {
in_page = raw_page;
in_offs = raw_off;
@@ -146,11 +150,11 @@ cryptoloop_transfer(struct loop_device *lo, int cmd,
u32 iv[4] = { 0, };
iv[0] = cpu_to_le32(IV & 0xffffffff);
- sg_in.page = in_page;
+ sg_set_page(&sg_in, in_page);
sg_in.offset = in_offs;
sg_in.length = sz;
- sg_out.page = out_page;
+ sg_set_page(&sg_out, out_page);
sg_out.offset = out_offs;
sg_out.length = sz;
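The cryptoloop conversion above also shows the second half of the new contract: a scatterlist table must be initialized with sg_init_table() before any entry is filled in, since that call both zeroes the table and sets the end marker that sg chaining depends on. A hedged sketch, where NSEGS and prepare() are hypothetical rather than symbols from this commit:

#include <linux/scatterlist.h>

#define NSEGS 16	/* hypothetical segment limit */

static void prepare(struct scatterlist sg[NSEGS], void *buf, unsigned int len)
{
	sg_init_table(sg, NSEGS);	/* zero entries, mark sg[NSEGS-1] as last */
	sg_set_buf(&sg[0], buf, len);	/* then fill individual entries */
}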
diff --git a/drivers/block/lguest_blk.c b/drivers/block/lguest_blk.c
deleted file mode 100644
index fa8e42341b8..00000000000
--- a/drivers/block/lguest_blk.c
+++ /dev/null
@@ -1,421 +0,0 @@
-/*D:400
- * The Guest block driver
- *
- * This is a simple block driver, which appears as /dev/lgba, lgbb, lgbc etc.
- * The mechanism is simple: we place the information about the request in the
- * device page, then use SEND_DMA (containing the data for a write, or an empty
- * "ping" DMA for a read).
- :*/
-/* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-//#define DEBUG
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/blkdev.h>
-#include <linux/interrupt.h>
-#include <linux/lguest_bus.h>
-
-static char next_block_index = 'a';
-
-/*D:420 Here is the structure which holds all the information we need about
- * each Guest block device.
- *
- * I'm sure at this stage, you're wondering "hey, where was the adventure I was
- * promised?" and thinking "Rusty sucks, I shall say nasty things about him on
- * my blog". I think Real adventures have boring bits, too, and you're in the
- * middle of one. But it gets better. Just not quite yet. */
-struct blockdev
-{
- /* The block queue infrastructure wants a spinlock: it is held while it
- * calls our block request function. We grab it in our interrupt
- * handler so the responses don't mess with new requests. */
- spinlock_t lock;
-
- /* The disk structure registered with kernel. */
- struct gendisk *disk;
-
- /* The major device number for this disk, and the interrupt. We only
- * really keep them here for completeness; we'd need them if we
- * supported device unplugging. */
- int major;
- int irq;
-
- /* The physical address of this device's memory page */
- unsigned long phys_addr;
- /* The mapped memory page for convenient access. */
- struct lguest_block_page *lb_page;
-
- /* We only have a single request outstanding at a time: this is it. */
- struct lguest_dma dma;
- struct request *req;
-};
-
-/*D:495 We originally used end_request() throughout the driver, but it turns
- * out that end_request() is deprecated, and doesn't actually end the request
- * (which seems like a good reason to deprecate it!). It simply ends the first
- * bio. So if we had 3 bios in a "struct request" we would do all 3,
- * end_request(), do 2, end_request(), do 1 and end_request(): twice as much
- * work as we needed to do.
- *
- * This reinforced to me that I do not understand the block layer.
- *
- * Nonetheless, Jens Axboe gave me this nice helper to end all chunks of a
- * request. This improved disk speed by 130%. */
-static void end_entire_request(struct request *req, int uptodate)
-{
- if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
- BUG();
- add_disk_randomness(req->rq_disk);
- blkdev_dequeue_request(req);
- end_that_request_last(req, uptodate);
-}
-
-/* I'm told there are only two stories in the world worth telling: love and
- * hate. So there used to be a love scene here like this:
- *
- * Launcher: We could make beautiful I/O together, you and I.
- * Guest: My, that's a big disk!
- *
- * Unfortunately, it was just too raunchy for our otherwise-gentle tale. */
-
-/*D:490 This is the interrupt handler, called when a block read or write has
- * been completed for us. */
-static irqreturn_t lgb_irq(int irq, void *_bd)
-{
- /* We handed our "struct blockdev" as the argument to request_irq(), so
- * it is passed through to us here. This tells us which device we're
- * dealing with in case we have more than one. */
- struct blockdev *bd = _bd;
- unsigned long flags;
-
- /* We weren't doing anything? Strange, but could happen if we shared
- * interrupts (we don't!). */
- if (!bd->req) {
- pr_debug("No work!\n");
- return IRQ_NONE;
- }
-
- /* Not done yet? That's equally strange. */
- if (!bd->lb_page->result) {
- pr_debug("No result!\n");
- return IRQ_NONE;
- }
-
- /* We have to grab the lock before ending the request. */
- spin_lock_irqsave(&bd->lock, flags);
- /* "result" is 1 for success, 2 for failure: end_entire_request() wants
- * to know whether this succeeded or not. */
- end_entire_request(bd->req, bd->lb_page->result == 1);
- /* Clear out request, it's done. */
- bd->req = NULL;
- /* Reset incoming DMA for next time. */
- bd->dma.used_len = 0;
- /* Ready for more reads or writes */
- blk_start_queue(bd->disk->queue);
- spin_unlock_irqrestore(&bd->lock, flags);
-
- /* The interrupt was for us, we dealt with it. */
- return IRQ_HANDLED;
-}
-
-/*D:480 The block layer's "struct request" contains a number of "struct bio"s,
- * each of which contains "struct bio_vec"s, each of which contains a page, an
- * offset and a length.
- *
- * Fortunately there are iterators to help us walk through the "struct
- * request". Even more fortunately, there were plenty of places to steal the
- * code from. We pack the "struct request" into our "struct lguest_dma" and
- * return the total length. */
-static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
-{
- unsigned int i = 0, len = 0;
- struct req_iterator iter;
- struct bio_vec *bvec;
-
- rq_for_each_segment(bvec, req, iter) {
- /* We told the block layer not to give us too many. */
- BUG_ON(i == LGUEST_MAX_DMA_SECTIONS);
- /* If we had a zero-length segment, it would look like
- * the end of the data referred to by the "struct
- * lguest_dma", so make sure that doesn't happen. */
- BUG_ON(!bvec->bv_len);
- /* Convert page & offset to a physical address */
- dma->addr[i] = page_to_phys(bvec->bv_page)
- + bvec->bv_offset;
- dma->len[i] = bvec->bv_len;
- len += bvec->bv_len;
- i++;
- }
- /* If the array isn't full, we mark the end with a 0 length */
- if (i < LGUEST_MAX_DMA_SECTIONS)
- dma->len[i] = 0;
- return len;
-}
-
-/* This creates an empty DMA, useful for prodding the Host without sending data
- * (ie. when we want to do a read) */
-static void empty_dma(struct lguest_dma *dma)
-{
- dma->len[0] = 0;
-}
-
-/*D:470 Setting up a request is fairly easy: */
-static void setup_req(struct blockdev *bd,
- int type, struct request *req, struct lguest_dma *dma)
-{
- /* The type is 1 (write) or 0 (read). */
- bd->lb_page->type = type;
- /* The sector on disk where the read or write starts. */
- bd->lb_page->sector = req->sector;
- /* The result is initialized to 0 (unfinished). */
- bd->lb_page->result = 0;
- /* The current request (so we can end it in the interrupt handler). */
- bd->req = req;
- /* The number of bytes: returned as a side-effect of req_to_dma(),
- * which packs the block layer's "struct request" into our "struct
- * lguest_dma" */
- bd->lb_page->bytes = req_to_dma(req, dma);
-}
-
-/*D:450 Write is pretty straightforward: we pack the request into a "struct
- * lguest_dma", then use SEND_DMA to send the request. */
-static void do_write(struct blockdev *bd, struct request *req)
-{
- struct lguest_dma send;
-
- pr_debug("lgb: WRITE sector %li\n", (long)req->sector);
- setup_req(bd, 1, req, &send);
-
- lguest_send_dma(bd->phys_addr, &send);
-}
-
-/* Read is similar to write, except we pack the request into our receive
- * "struct lguest_dma" and send through an empty DMA just to tell the Host that
- * there's a request pending. */
-static void do_read(struct blockdev *bd, struct request *req)
-{
- struct lguest_dma ping;
-
- pr_debug("lgb: READ sector %li\n", (long)req->sector);
- setup_req(bd, 0, req, &bd->dma);
-
- empty_dma(&ping);
- lguest_send_dma(bd->phys_addr, &ping);
-}
-
-/*D:440 This is where requests come in: we get handed the request queue and are
- * expected to pull a "struct request" off it until we've finished them or
- * we're waiting for a reply: */
-static void do_lgb_request(struct request_queue *q)
-{
- struct blockdev *bd;
- struct request *req;
-
-again:
- /* This sometimes returns NULL even on the very first time around. I
- * wonder if it's something to do with letting elves handle the request
- * queue... */
- req = elv_next_request(q);
- if (!req)
- return;
-
- /* We attached the struct blockdev to the disk: get it back */
- bd = req->rq_disk->private_data;
- /* Sometimes we get repeated requests after blk_stop_queue(), but we
- * can only handle one at a time. */
- if (bd->req)
- return;
-
- /* We only do reads and writes: no tricky business! */
- if (!blk_fs_request(req)) {
- pr_debug("Got non-command 0x%08x\n", req->cmd_type);
- req->errors++;
- end_entire_request(req, 0);
- goto again;
- }
-
- if (rq_data_dir(req) == WRITE)
- do_write(bd, req);
- else
- do_read(bd, req);
-
- /* We've put out the request, so stop any more coming in until we get
- * an interrupt, which takes us to lgb_irq() to re-enable the queue. */
- blk_stop_queue(q);
-}
-
-/*D:430 This is the "struct block_device_operations" we attach to the disk at
- * the end of lguestblk_probe(). It doesn't seem to want much. */
-static struct block_device_operations lguestblk_fops = {
- .owner = THIS_MODULE,
-};
-
-/*D:425 Setting up a disk device seems to involve a lot of code. I'm not sure
- * quite why. I do know that the IDE code sent two or three of the maintainers
- * insane, perhaps this is the fringe of the same disease?
- *
- * As in the console code, the probe function gets handed the generic
- * lguest_device from lguest_bus.c: */
-static int lguestblk_probe(struct lguest_device *lgdev)
-{
- struct blockdev *bd;
- int err;
- int irqflags = IRQF_SHARED;
-
- /* First we allocate our own "struct blockdev" and initialize the easy
- * fields. */
- bd = kmalloc(sizeof(*bd), GFP_KERNEL);
- if (!bd)
- return -ENOMEM;
-
- spin_lock_init(&bd->lock);
- bd->irq = lgdev_irq(lgdev);
- bd->req = NULL;
- bd->dma.used_len = 0;
- bd->dma.len[0] = 0;
- /* The descriptor in the lguest_devices array provided by the Host
- * gives the Guest the physical page number of the device's page. */
- bd->phys_addr = (lguest_devices[lgdev->index].pfn << PAGE_SHIFT);
-
- /* We use lguest_map() to get a pointer to the device page */
- bd->lb_page = lguest_map(bd->phys_addr, 1);
- if (!bd->lb_page) {
- err = -ENOMEM;
- goto out_free_bd;
- }
-
- /* We need a major device number: 0 means "assign one dynamically". */
- bd->major = register_blkdev(0, "lguestblk");
- if (bd->major < 0) {
- err = bd->major;
- goto out_unmap;
- }
-
- /* This allocates a "struct gendisk" where we pack all the information
- * about the disk which the rest of Linux sees. The argument is the
- * number of minor devices desired: we need one minor for the main
- * disk, and one for each partition. Of course, we can't possibly know
- * how many partitions are on the disk (add_disk does that).
- */
- bd->disk = alloc_disk(16);
- if (!bd->disk) {
- err = -ENOMEM;
- goto out_unregister_blkdev;
- }
-
- /* Every disk needs a queue for requests to come in: we set up the
- * queue with a callback function (the core of our driver) and the lock
- * to use. */
- bd->disk->queue = blk_init_queue(do_lgb_request, &bd->lock);
- if (!bd->disk->queue) {
- err = -ENOMEM;
- goto out_put_disk;
- }
-
- /* We can only handle a certain number of pointers in our SEND_DMA
- * call, so we set that with blk_queue_max_hw_segments(). This is not
- * to be confused with blk_queue_max_phys_segments() of course! I
- * know, who could possibly confuse the two?
- *
- * Well, it's simple to tell them apart: this one seems to work and the
- * other one didn't. */
- blk_queue_max_hw_segments(bd->disk->queue, LGUEST_MAX_DMA_SECTIONS);
-
- /* Due to technical limitations of our Host (and simple coding) we
- * can't have a single buffer which crosses a page boundary. Tell it
- * here. This means that our maximum request size is 16
- * (LGUEST_MAX_DMA_SECTIONS) pages. */
- blk_queue_segment_boundary(bd->disk->queue, PAGE_SIZE-1);
-
- /* We name our disk: this becomes the device name when udev does its
- * magic thing and creates the device node, such as /dev/lgba.
- * next_block_index is a global which starts at 'a'. Unfortunately
- * this simple increment logic means that the 27th disk will be called
- * "/dev/lgb{". In that case, I recommend having at least 29 disks, so
- * your /dev directory will be balanced. */
- sprintf(bd->disk->disk_name, "lgb%c", next_block_index++);
-
- /* We look to the device descriptor again to see if this device's
- * interrupts are expected to be random. If they are, we tell the irq
- * subsystem. At the moment this bit is always set. */
- if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS)
- irqflags |= IRQF_SAMPLE_RANDOM;
-
- /* Now we have the name and irqflags, we can request the interrupt; we
- * give it the "struct blockdev" we have set up to pass to lgb_irq()
- * when there is an interrupt. */
- err = request_irq(bd->irq, lgb_irq, irqflags, bd->disk->disk_name, bd);
- if (err)
- goto out_cleanup_queue;
-
- /* We bind our one-entry DMA pool to the key for this block device so
- * the Host can reply to our requests. The key is equal to the
- * physical address of the device's page, which is conveniently
- * unique. */
- err = lguest_bind_dma(bd->phys_addr, &bd->dma, 1, bd->irq);
- if (err)
- goto out_free_irq;
-
- /* We finish our disk initialization and add the disk to the system. */
- bd->disk->major = bd->major;
- bd->disk->first_minor = 0;
- bd->disk->private_data = bd;
- bd->disk->fops = &lguestblk_fops;
- /* This is initialized to the disk size by the Launcher. */
- set_capacity(bd->disk, bd->lb_page->num_sectors);
- add_disk(bd->disk);
-
- printk(KERN_INFO "%s: device %i at major %d\n",
- bd->disk->disk_name, lgdev->index, bd->major);
-
- /* We don't need to keep the "struct blockdev" around, but if we ever
- * implemented device removal, we'd need this. */
- lgdev->private = bd;
- return 0;
-
-out_free_irq:
- free_irq(bd->irq, bd);
-out_cleanup_queue:
- blk_cleanup_queue(bd->disk->queue);
-out_put_disk:
- put_disk(bd->disk);
-out_unregister_blkdev:
- unregister_blkdev(bd->major, "lguestblk");
-out_unmap:
- lguest_unmap(bd->lb_page);
-out_free_bd:
- kfree(bd);
- return err;
-}
-
-/*D:410 The boilerplate code for registering the lguest block driver is just
- * like the console: */
-static struct lguest_driver lguestblk_drv = {
- .name = "lguestblk",
- .owner = THIS_MODULE,
- .device_type = LGUEST_DEVICE_T_BLOCK,
- .probe = lguestblk_probe,
-};
-
-static __init int lguestblk_init(void)
-{
- return register_lguest_driver(&lguestblk_drv);
-}
-module_init(lguestblk_init);
-
-MODULE_DESCRIPTION("Lguest block driver");
-MODULE_LICENSE("GPL");
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 317a790c153..7276f7d207c 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -388,6 +388,7 @@ static int __send_request(struct request *req)
op = VD_OP_BWRITE;
}
+ sg_init_table(sg, port->ring_cookies);
nsg = blk_rq_map_sg(req->q, req, sg);
len = 0;
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 402209fec59..52dc5e13171 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -27,6 +27,7 @@
#include <linux/hdreg.h>
#include <linux/dma-mapping.h>
#include <linux/completion.h>
+#include <linux/scatterlist.h>
#include <asm/io.h>
#include <asm/uaccess.h>
@@ -522,6 +523,7 @@ static struct carm_request *carm_get_request(struct carm_host *host)
host->n_msgs++;
assert(host->n_msgs <= CARM_MAX_REQ);
+ sg_init_table(crq->sg, CARM_MAX_REQ_SG);
return crq;
}
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index c57dd2b3a0c..14143f2c484 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -25,6 +25,7 @@
#include <linux/usb_usual.h>
#include <linux/blkdev.h>
#include <linux/timer.h>
+#include <linux/scatterlist.h>
#include <scsi/scsi.h>
#define DRV_NAME "ub"
@@ -656,6 +657,7 @@ static int ub_request_fn_1(struct ub_lun *lun, struct request *rq)
if ((cmd = ub_get_cmd(lun)) == NULL)
return -1;
memset(cmd, 0, sizeof(struct ub_scsi_cmd));
+ sg_init_table(cmd->sgv, UB_MAX_REQ_SG);
blkdev_dequeue_request(rq);
@@ -1309,9 +1311,8 @@ static void ub_data_start(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
else
pipe = sc->send_bulk_pipe;
sc->last_pipe = pipe;
- usb_fill_bulk_urb(&sc->work_urb, sc->dev, pipe,
- page_address(sg->page) + sg->offset, sg->length,
- ub_urb_complete, sc);
+ usb_fill_bulk_urb(&sc->work_urb, sc->dev, pipe, sg_virt(sg),
+ sg->length, ub_urb_complete, sc);
sc->work_urb.actual_length = 0;
sc->work_urb.error_count = 0;
sc->work_urb.status = 0;
@@ -1427,7 +1428,7 @@ static void ub_state_sense(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
scmd->state = UB_CMDST_INIT;
scmd->nsg = 1;
sg = &scmd->sgv[0];
- sg->page = virt_to_page(sc->top_sense);
+ sg_set_page(sg, virt_to_page(sc->top_sense));
sg->offset = (unsigned long)sc->top_sense & (PAGE_SIZE-1);
sg->length = UB_SENSE_SIZE;
scmd->len = UB_SENSE_SIZE;
@@ -1863,7 +1864,7 @@ static int ub_sync_read_cap(struct ub_dev *sc, struct ub_lun *lun,
cmd->state = UB_CMDST_INIT;
cmd->nsg = 1;
sg = &cmd->sgv[0];
- sg->page = virt_to_page(p);
+ sg_set_page(sg, virt_to_page(p));
sg->offset = (unsigned long)p & (PAGE_SIZE-1);
sg->length = 8;
cmd->len = 8;
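The usb_fill_bulk_urb() hunk above also replaces the open-coded page_address(sg->page) + sg->offset with sg_virt(). Roughly what that helper expands to, sketched here (valid only when the entry references a permanently mapped lowmem page; sg_virt_sketch() is illustrative, not the kernel's copy):

#include <linux/scatterlist.h>
#include <linux/mm.h>

static inline void *sg_virt_sketch(struct scatterlist *sg)
{
	return page_address(sg_page(sg)) + sg->offset;
}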
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index e824b672e05..ab5d404faa1 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -41,6 +41,7 @@
#include <linux/dma-mapping.h>
#include <linux/completion.h>
#include <linux/device.h>
+#include <linux/scatterlist.h>
#include <asm/uaccess.h>
#include <asm/vio.h>
@@ -270,6 +271,7 @@ static int send_request(struct request *req)
d = req->rq_disk->private_data;
/* Now build the scatter-gather list */
+ sg_init_table(sg, VIOMAXBLOCKDMA);
nsg = blk_rq_map_sg(req->q, req, sg);
nsg = dma_map_sg(d->dev, sg, nsg, direction);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
new file mode 100644
index 00000000000..a901eee64ba
--- /dev/null
+++ b/drivers/block/virtio_blk.c
@@ -0,0 +1,308 @@
+//#define DEBUG
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/virtio.h>
+#include <linux/virtio_blk.h>
+
+static unsigned char virtblk_index = 'a';
+struct virtio_blk
+{
+ spinlock_t lock;
+
+ struct virtio_device *vdev;
+ struct virtqueue *vq;
+
+ /* The disk structure for the kernel. */
+ struct gendisk *disk;
+
+ /* Request tracking. */
+ struct list_head reqs;
+
+ mempool_t *pool;
+
+ /* Scatterlist: can be too big for stack. */
+ struct scatterlist sg[3+MAX_PHYS_SEGMENTS];
+};
+
+struct virtblk_req
+{
+ struct list_head list;
+ struct request *req;
+ struct virtio_blk_outhdr out_hdr;
+ struct virtio_blk_inhdr in_hdr;
+};
+
+static bool blk_done(struct virtqueue *vq)
+{
+ struct virtio_blk *vblk = vq->vdev->priv;
+ struct virtblk_req *vbr;
+ unsigned int len;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vblk->lock, flags);
+ while ((vbr = vblk->vq->vq_ops->get_buf(vblk->vq, &len)) != NULL) {
+ int uptodate;
+ switch (vbr->in_hdr.status) {
+ case VIRTIO_BLK_S_OK:
+ uptodate = 1;
+ break;
+ case VIRTIO_BLK_S_UNSUPP:
+ uptodate = -ENOTTY;
+ break;
+ default:
+ uptodate = 0;
+ break;
+ }
+
+ end_dequeued_request(vbr->req, uptodate);
+ list_del(&vbr->list);
+ mempool_free(vbr, vblk->pool);
+ }
+ /* In case queue is stopped waiting for more buffers. */
+ blk_start_queue(vblk->disk->queue);
+ spin_unlock_irqrestore(&vblk->lock, flags);
+ return true;
+}
+
+static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
+ struct request *req)
+{
+ unsigned long num, out, in;
+ struct virtblk_req *vbr;
+
+ vbr = mempool_alloc(vblk->pool, GFP_ATOMIC);
+ if (!vbr)
+ /* When another request finishes we'll try again. */
+ return false;
+
+ vbr->req = req;
+ if (blk_fs_request(vbr->req)) {
+ vbr->out_hdr.type = 0;
+ vbr->out_hdr.sector = vbr->req->sector;
+ vbr->out_hdr.ioprio = vbr->req->ioprio;
+ } else if (blk_pc_request(vbr->req)) {
+ vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD;
+ vbr->out_hdr.sector = 0;
+ vbr->out_hdr.ioprio = vbr->req->ioprio;
+ } else {
+ /* We don't put anything else in the queue. */
+ BUG();
+ }
+
+ if (blk_barrier_rq(vbr->req))
+ vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER;
+
+ /* We have to zero this, otherwise blk_rq_map_sg gets upset. */
+ memset(vblk->sg, 0, sizeof(vblk->sg));
+ sg_set_buf(&vblk->sg[0], &vbr->out_hdr, sizeof(vbr->out_hdr));
+ num = blk_rq_map_sg(q, vbr->req, vblk->sg+1);
+ sg_set_buf(&vblk->sg[num+1], &vbr->in_hdr, sizeof(vbr->in_hdr));
+
+ if (rq_data_dir(vbr->req) == WRITE) {
+ vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
+ out = 1 + num;
+ in = 1;
+ } else {
+ vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
+ out = 1;
+ in = 1 + num;
+ }
+
+ if (vblk->vq->vq_ops->add_buf(vblk->vq, vblk->sg, out, in, vbr)) {
+ mempool_free(vbr, vblk->pool);
+ return false;
+ }
+
+ list_add_tail(&vbr->list, &vblk->reqs);
+ return true;
+}
+
+static void do_virtblk_request(struct request_queue *q)
+{
+ struct virtio_blk *vblk = NULL;
+ struct request *req;
+ unsigned int issued = 0;
+
+ while ((req = elv_next_request(q)) != NULL) {
+ vblk = req->rq_disk->private_data;
+ BUG_ON(req->nr_phys_segments > ARRAY_SIZE(vblk->sg));
+
+ /* If this request fails, stop queue and wait for something to
+ finish to restart it. */
+ if (!do_req(q, vblk, req)) {
+ blk_stop_queue(q);
+ break;
+ }
+ blkdev_dequeue_request(req);
+ issued++;
+ }
+
+ if (issued)
+ vblk->vq->vq_ops->kick(vblk->vq);
+}
+
+static int virtblk_ioctl(struct inode *inode, struct file *filp,
+ unsigned cmd, unsigned long data)
+{
+ return scsi_cmd_ioctl(filp, inode->i_bdev->bd_disk->queue,
+ inode->i_bdev->bd_disk, cmd,
+ (void __user *)data);
+}
+
+static struct block_device_operations virtblk_fops = {
+ .ioctl = virtblk_ioctl,
+ .owner = THIS_MODULE,
+};
+
+static int virtblk_probe(struct virtio_device *vdev)
+{
+ struct virtio_blk *vblk;
+ int err, major;
+ void *token;
+ unsigned int len;
+ u64 cap;
+ u32 v;
+
+ vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
+ if (!vblk) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&vblk->reqs);
+ spin_lock_init(&vblk->lock);
+ vblk->vdev = vdev;
+
+ /* We expect one virtqueue, for output. */
+ vblk->vq = vdev->config->find_vq(vdev, blk_done);
+ if (IS_ERR(vblk->vq)) {
+ err = PTR_ERR(vblk->vq);
+ goto out_free_vblk;
+ }
+
+ vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req));
+ if (!vblk->pool) {
+ err = -ENOMEM;
+ goto out_free_vq;
+ }
+
+ major = register_blkdev(0, "virtblk");
+ if (major < 0) {
+ err = major;
+ goto out_mempool;
+ }
+
+ /* FIXME: How many partitions? How long is a piece of string? */
+ vblk->disk = alloc_disk(1 << 4);
+ if (!vblk->disk) {
+ err = -ENOMEM;
+ goto out_unregister_blkdev;
+ }
+
+ vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock);
+ if (!vblk->disk->queue) {
+ err = -ENOMEM;
+ goto out_put_disk;
+ }
+
+ sprintf(vblk->disk->disk_name, "vd%c", virtblk_index++);
+ vblk->disk->major = major;
+ vblk->disk->first_minor = 0;
+ vblk->disk->private_data = vblk;
+ vblk->disk->fops = &virtblk_fops;
+
+ /* If barriers are supported, tell block layer that queue is ordered */
+ token = vdev->config->find(vdev, VIRTIO_CONFIG_BLK_F, &len);
+ if (virtio_use_bit(vdev, token, len, VIRTIO_BLK_F_BARRIER))
+ blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL);
+
+ err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_CAPACITY, &cap);
+ if (err) {
+ dev_err(&vdev->dev, "Bad/missing capacity in config\n");
+ goto out_put_disk;
+ }
+
+ /* If capacity is too big, truncate with warning. */
+ if ((sector_t)cap != cap) {
+ dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n",
+ (unsigned long long)cap);
+ cap = (sector_t)-1;
+ }
+ set_capacity(vblk->disk, cap);
+
+ err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SIZE_MAX, &v);
+ if (!err)
+ blk_queue_max_segment_size(vblk->disk->queue, v);
+ else if (err != -ENOENT) {
+ dev_err(&vdev->dev, "Bad SIZE_MAX in config\n");
+ goto out_put_disk;
+ }
+
+ err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SEG_MAX, &v);
+ if (!err)
+ blk_queue_max_hw_segments(vblk->disk->queue, v);
+ else if (err != -ENOENT) {
+ dev_err(&vdev->dev, "Bad SEG_MAX in config\n");
+ goto out_put_disk;
+ }
+
+ add_disk(vblk->disk);
+ return 0;
+
+out_put_disk:
+ put_disk(vblk->disk);
+out_unregister_blkdev:
+ unregister_blkdev(major, "virtblk");
+out_mempool:
+ mempool_destroy(vblk->pool);
+out_free_vq:
+ vdev->config->del_vq(vblk->vq);
+out_free_vblk:
+ kfree(vblk);
+out:
+ return err;
+}
+
+static void virtblk_remove(struct virtio_device *vdev)
+{
+ struct virtio_blk *vblk = vdev->priv;
+ int major = vblk->disk->major;
+
+ BUG_ON(!list_empty(&vblk->reqs));
+ blk_cleanup_queue(vblk->disk->queue);
+ put_disk(vblk->disk);
+ unregister_blkdev(major, "virtblk");
+ mempool_destroy(vblk->pool);
+ kfree(vblk);
+}
+
+static struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
+ { 0 },
+};
+
+static struct virtio_driver virtio_blk = {
+ .driver.name = KBUILD_MODNAME,
+ .driver.owner = THIS_MODULE,
+ .id_table = id_table,
+ .probe = virtblk_probe,
+ .remove = __devexit_p(virtblk_remove),
+};
+
+static int __init init(void)
+{
+ return register_virtio_driver(&virtio_blk);
+}
+
+static void __exit fini(void)
+{
+ unregister_virtio_driver(&virtio_blk);
+}
+module_init(init);
+module_exit(fini);
+
+MODULE_DEVICE_TABLE(virtio, id_table);
+MODULE_DESCRIPTION("Virtio block driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/bluetooth/Kconfig b/drivers/bluetooth/Kconfig
index b9fbe6e7f9a..075598e1c50 100644
--- a/drivers/bluetooth/Kconfig
+++ b/drivers/bluetooth/Kconfig
@@ -22,6 +22,30 @@ config BT_HCIUSB_SCO
Say Y here to compile support for SCO over HCI USB.
+config BT_HCIBTUSB
+ tristate "HCI USB driver (alternate version)"
+ depends on USB && EXPERIMENTAL && BT_HCIUSB=n
+ help
+ Bluetooth HCI USB driver.
+ This driver is required if you want to use Bluetooth devices with
+ USB interface.
+
+ This driver is still experimental and has no SCO support.
+
+ Say Y here to compile support for Bluetooth USB devices into the
+ kernel or say M to compile it as module (btusb).
+
+config BT_HCIBTSDIO
+ tristate "HCI SDIO driver"
+ depends on MMC
+ help
+ Bluetooth HCI SDIO driver.
+ This driver is required if you want to use Bluetooth device with
+ SDIO interface.
+
+ Say Y here to compile support for Bluetooth SDIO devices into the
+ kernel or say M to compile it as module (btsdio).
+
config BT_HCIUART
tristate "HCI UART driver"
help
@@ -55,6 +79,17 @@ config BT_HCIUART_BCSP
Say Y here to compile support for HCI BCSP protocol.
+config BT_HCIUART_LL
+ bool "HCILL protocol support"
+ depends on BT_HCIUART
+ help
+ HCILL (HCI Low Level) is a serial protocol for communication
+ between Bluetooth device and host. This protocol is required for
+ serial Bluetooth devices that are based on Texas Instruments'
+ BRF chips.
+
+ Say Y here to compile support for HCILL protocol.
+
config BT_HCIBCM203X
tristate "HCI BCM203x USB driver"
depends on USB
diff --git a/drivers/bluetooth/Makefile b/drivers/bluetooth/Makefile
index 08c10e178e0..77444afbf10 100644
--- a/drivers/bluetooth/Makefile
+++ b/drivers/bluetooth/Makefile
@@ -13,7 +13,11 @@ obj-$(CONFIG_BT_HCIBT3C) += bt3c_cs.o
obj-$(CONFIG_BT_HCIBLUECARD) += bluecard_cs.o
obj-$(CONFIG_BT_HCIBTUART) += btuart_cs.o
+obj-$(CONFIG_BT_HCIBTUSB) += btusb.o
+obj-$(CONFIG_BT_HCIBTSDIO) += btsdio.o
+
hci_uart-y := hci_ldisc.o
hci_uart-$(CONFIG_BT_HCIUART_H4) += hci_h4.o
hci_uart-$(CONFIG_BT_HCIUART_BCSP) += hci_bcsp.o
+hci_uart-$(CONFIG_BT_HCIUART_LL) += hci_ll.o
hci_uart-objs := $(hci_uart-y)
diff --git a/drivers/bluetooth/bluecard_cs.c b/drivers/bluetooth/bluecard_cs.c
index 851de4d5b7d..bcf57927b7a 100644
--- a/drivers/bluetooth/bluecard_cs.c
+++ b/drivers/bluetooth/bluecard_cs.c
@@ -503,10 +503,7 @@ static irqreturn_t bluecard_interrupt(int irq, void *dev_inst)
unsigned int iobase;
unsigned char reg;
- if (!info || !info->hdev) {
- BT_ERR("Call of irq %d for unknown device", irq);
- return IRQ_NONE;
- }
+ BUG_ON(!info->hdev);
if (!test_bit(CARD_READY, &(info->hw_state)))
return IRQ_HANDLED;
diff --git a/drivers/bluetooth/bpa10x.c b/drivers/bluetooth/bpa10x.c
index e8ebd5d3de8..1375b5345a0 100644
--- a/drivers/bluetooth/bpa10x.c
+++ b/drivers/bluetooth/bpa10x.c
@@ -2,7 +2,7 @@
*
* Digianswer Bluetooth USB driver
*
- * Copyright (C) 2004-2005 Marcel Holtmann <marcel@holtmann.org>
+ * Copyright (C) 2004-2007 Marcel Holtmann <marcel@holtmann.org>
*
*
* This program is free software; you can redistribute it and/or modify
@@ -21,13 +21,14 @@
*
*/
-#include <linux/module.h>
-
#include <linux/kernel.h>
+#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/types.h>
+#include <linux/sched.h>
#include <linux/errno.h>
+#include <linux/skbuff.h>
#include <linux/usb.h>
@@ -39,7 +40,7 @@
#define BT_DBG(D...)
#endif
-#define VERSION "0.8"
+#define VERSION "0.9"
static int ignore = 0;
@@ -52,393 +53,285 @@ static struct usb_device_id bpa10x_table[] = {
MODULE_DEVICE_TABLE(usb, bpa10x_table);
-#define BPA10X_CMD_EP 0x00
-#define BPA10X_EVT_EP 0x81
-#define BPA10X_TX_EP 0x02
-#define BPA10X_RX_EP 0x82
-
-#define BPA10X_CMD_BUF_SIZE 252
-#define BPA10X_EVT_BUF_SIZE 16
-#define BPA10X_TX_BUF_SIZE 384
-#define BPA10X_RX_BUF_SIZE 384
-
struct bpa10x_data {
- struct hci_dev *hdev;
- struct usb_device *udev;
+ struct hci_dev *hdev;
+ struct usb_device *udev;
- rwlock_t lock;
+ struct usb_anchor tx_anchor;
+ struct usb_anchor rx_anchor;
- struct sk_buff_head cmd_queue;
- struct urb *cmd_urb;
- struct urb *evt_urb;
- struct sk_buff *evt_skb;
- unsigned int evt_len;
-
- struct sk_buff_head tx_queue;
- struct urb *tx_urb;
- struct urb *rx_urb;
+ struct sk_buff *rx_skb[2];
};
-#define HCI_VENDOR_HDR_SIZE 5
+#define HCI_VENDOR_HDR_SIZE 5
struct hci_vendor_hdr {
- __u8 type;
- __le16 snum;
- __le16 dlen;
+ __u8 type;
+ __le16 snum;
+ __le16 dlen;
} __attribute__ ((packed));
-static void bpa10x_recv_bulk(struct bpa10x_data *data, unsigned char *buf, int count)
+static int bpa10x_recv(struct hci_dev *hdev, int queue, void *buf, int count)
{
- struct hci_acl_hdr *ah;
- struct hci_sco_hdr *sh;
- struct hci_vendor_hdr *vh;
- struct sk_buff *skb;
- int len;
+ struct bpa10x_data *data = hdev->driver_data;
+
+ BT_DBG("%s queue %d buffer %p count %d", hdev->name,
+ queue, buf, count);
+
+ if (queue < 0 || queue > 1)
+ return -EILSEQ;
+
+ hdev->stat.byte_rx += count;
while (count) {
- switch (*buf++) {
- case HCI_ACLDATA_PKT:
- ah = (struct hci_acl_hdr *) buf;
- len = HCI_ACL_HDR_SIZE + __le16_to_cpu(ah->dlen);
- skb = bt_skb_alloc(len, GFP_ATOMIC);
- if (skb) {
- memcpy(skb_put(skb, len), buf, len);
- skb->dev = (void *) data->hdev;
- bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT;
- hci_recv_frame(skb);
- }
- break;
+ struct sk_buff *skb = data->rx_skb[queue];
+ struct { __u8 type; int expect; } *scb;
+ int type, len = 0;
- case HCI_SCODATA_PKT:
- sh = (struct hci_sco_hdr *) buf;
- len = HCI_SCO_HDR_SIZE + sh->dlen;
- skb = bt_skb_alloc(len, GFP_ATOMIC);
- if (skb) {
- memcpy(skb_put(skb, len), buf, len);
- skb->dev = (void *) data->hdev;
- bt_cb(skb)->pkt_type = HCI_SCODATA_PKT;
- hci_recv_frame(skb);
+ if (!skb) {
+ /* Start of the frame */
+
+ type = *((__u8 *) buf);
+ count--; buf++;
+
+ switch (type) {
+ case HCI_EVENT_PKT:
+ if (count >= HCI_EVENT_HDR_SIZE) {
+ struct hci_event_hdr *h = buf;
+ len = HCI_EVENT_HDR_SIZE + h->plen;
+ } else
+ return -EILSEQ;
+ break;
+
+ case HCI_ACLDATA_PKT:
+ if (count >= HCI_ACL_HDR_SIZE) {
+ struct hci_acl_hdr *h = buf;
+ len = HCI_ACL_HDR_SIZE +
+ __le16_to_cpu(h->dlen);
+ } else
+ return -EILSEQ;
+ break;
+
+ case HCI_SCODATA_PKT:
+ if (count >= HCI_SCO_HDR_SIZE) {
+ struct hci_sco_hdr *h = buf;
+ len = HCI_SCO_HDR_SIZE + h->dlen;
+ } else
+ return -EILSEQ;
+ break;
+
+ case HCI_VENDOR_PKT:
+ if (count >= HCI_VENDOR_HDR_SIZE) {
+ struct hci_vendor_hdr *h = buf;
+ len = HCI_VENDOR_HDR_SIZE +
+ __le16_to_cpu(h->dlen);
+ } else
+ return -EILSEQ;
+ break;
}
- break;
- case HCI_VENDOR_PKT:
- vh = (struct hci_vendor_hdr *) buf;
- len = HCI_VENDOR_HDR_SIZE + __le16_to_cpu(vh->dlen);
skb = bt_skb_alloc(len, GFP_ATOMIC);
- if (skb) {
- memcpy(skb_put(skb, len), buf, len);
- skb->dev = (void *) data->hdev;
- bt_cb(skb)->pkt_type = HCI_VENDOR_PKT;
- hci_recv_frame(skb);
+ if (!skb) {
+ BT_ERR("%s no memory for packet", hdev->name);
+ return -ENOMEM;
}
- break;
-
- default:
- len = count - 1;
- break;
- }
- buf += len;
- count -= (len + 1);
- }
-}
-
-static int bpa10x_recv_event(struct bpa10x_data *data, unsigned char *buf, int size)
-{
- BT_DBG("data %p buf %p size %d", data, buf, size);
+ skb->dev = (void *) hdev;
- if (data->evt_skb) {
- struct sk_buff *skb = data->evt_skb;
+ data->rx_skb[queue] = skb;
- memcpy(skb_put(skb, size), buf, size);
+ scb = (void *) skb->cb;
+ scb->type = type;
+ scb->expect = len;
+ } else {
+ /* Continuation */
- if (skb->len == data->evt_len) {
- data->evt_skb = NULL;
- data->evt_len = 0;
- hci_recv_frame(skb);
- }
- } else {
- struct sk_buff *skb;
- struct hci_event_hdr *hdr;
- unsigned char pkt_type;
- int pkt_len = 0;
-
- if (size < HCI_EVENT_HDR_SIZE + 1) {
- BT_ERR("%s event packet block with size %d is too short",
- data->hdev->name, size);
- return -EILSEQ;
+ scb = (void *) skb->cb;
+ len = scb->expect;
}
- pkt_type = *buf++;
- size--;
-
- if (pkt_type != HCI_EVENT_PKT) {
- BT_ERR("%s unexpected event packet start byte 0x%02x",
- data->hdev->name, pkt_type);
- return -EPROTO;
- }
+ len = min(len, count);
- hdr = (struct hci_event_hdr *) buf;
- pkt_len = HCI_EVENT_HDR_SIZE + hdr->plen;
+ memcpy(skb_put(skb, len), buf, len);
- skb = bt_skb_alloc(pkt_len, GFP_ATOMIC);
- if (!skb) {
- BT_ERR("%s no memory for new event packet",
- data->hdev->name);
- return -ENOMEM;
- }
+ scb->expect -= len;
- skb->dev = (void *) data->hdev;
- bt_cb(skb)->pkt_type = pkt_type;
+ if (scb->expect == 0) {
+ /* Complete frame */
- memcpy(skb_put(skb, size), buf, size);
+ data->rx_skb[queue] = NULL;
- if (pkt_len == size) {
+ bt_cb(skb)->pkt_type = scb->type;
hci_recv_frame(skb);
- } else {
- data->evt_skb = skb;
- data->evt_len = pkt_len;
}
+
+ count -= len; buf += len;
}
return 0;
}
-static void bpa10x_wakeup(struct bpa10x_data *data)
+static void bpa10x_tx_complete(struct urb *urb)
{
- struct urb *urb;
- struct sk_buff *skb;
- int err;
+ struct sk_buff *skb = urb->context;
+ struct hci_dev *hdev = (struct hci_dev *) skb->dev;
- BT_DBG("data %p", data);
+ BT_DBG("%s urb %p status %d count %d", hdev->name,
+ urb, urb->status, urb->actual_length);
- urb = data->cmd_urb;
- if (urb->status == -EINPROGRESS)
- skb = NULL;
+ if (!test_bit(HCI_RUNNING, &hdev->flags))
+ goto done;
+
+ if (!urb->status)
+ hdev->stat.byte_tx += urb->transfer_buffer_length;
else
- skb = skb_dequeue(&data->cmd_queue);
+ hdev->stat.err_tx++;
- if (skb) {
- struct usb_ctrlrequest *cr;
+done:
+ kfree(urb->setup_packet);
- if (skb->len > BPA10X_CMD_BUF_SIZE) {
- BT_ERR("%s command packet with size %d is too big",
- data->hdev->name, skb->len);
- kfree_skb(skb);
- return;
- }
+ kfree_skb(skb);
+}
+
+static void bpa10x_rx_complete(struct urb *urb)
+{
+ struct hci_dev *hdev = urb->context;
+ struct bpa10x_data *data = hdev->driver_data;
+ int err;
- cr = (struct usb_ctrlrequest *) urb->setup_packet;
- cr->wLength = __cpu_to_le16(skb->len);
+ BT_DBG("%s urb %p status %d count %d", hdev->name,
+ urb, urb->status, urb->actual_length);
- skb_copy_from_linear_data(skb, urb->transfer_buffer, skb->len);
- urb->transfer_buffer_length = skb->len;
+ if (!test_bit(HCI_RUNNING, &hdev->flags))
+ return;
- err = usb_submit_urb(urb, GFP_ATOMIC);
- if (err < 0 && err != -ENODEV) {
- BT_ERR("%s submit failed for command urb %p with error %d",
- data->hdev->name, urb, err);
- skb_queue_head(&data->cmd_queue, skb);
- } else
- kfree_skb(skb);
+ if (urb->status == 0) {
+ if (bpa10x_recv(hdev, usb_pipebulk(urb->pipe),
+ urb->transfer_buffer,
+ urb->actual_length) < 0) {
+ BT_ERR("%s corrupted event packet", hdev->name);
+ hdev->stat.err_rx++;
+ }
}
- urb = data->tx_urb;
- if (urb->status == -EINPROGRESS)
- skb = NULL;
- else
- skb = skb_dequeue(&data->tx_queue);
-
- if (skb) {
- skb_copy_from_linear_data(skb, urb->transfer_buffer, skb->len);
- urb->transfer_buffer_length = skb->len;
-
- err = usb_submit_urb(urb, GFP_ATOMIC);
- if (err < 0 && err != -ENODEV) {
- BT_ERR("%s submit failed for command urb %p with error %d",
- data->hdev->name, urb, err);
- skb_queue_head(&data->tx_queue, skb);
- } else
- kfree_skb(skb);
+ usb_anchor_urb(urb, &data->rx_anchor);
+
+ err = usb_submit_urb(urb, GFP_ATOMIC);
+ if (err < 0) {
+ BT_ERR("%s urb %p failed to resubmit (%d)",
+ hdev->name, urb, -err);
+ usb_unanchor_urb(urb);
}
}
-static void bpa10x_complete(struct urb *urb)
+static inline int bpa10x_submit_intr_urb(struct hci_dev *hdev)
{
- struct bpa10x_data *data = urb->context;
- unsigned char *buf = urb->transfer_buffer;
- int err, count = urb->actual_length;
+ struct bpa10x_data *data = hdev->driver_data;
+ struct urb *urb;
+ unsigned char *buf;
+ unsigned int pipe;
+ int err, size = 16;
- BT_DBG("data %p urb %p buf %p count %d", data, urb, buf, count);
+ BT_DBG("%s", hdev->name);
- read_lock(&data->lock);
+ urb = usb_alloc_urb(0, GFP_KERNEL);
+ if (!urb)
+ return -ENOMEM;
- if (!test_bit(HCI_RUNNING, &data->hdev->flags))
- goto unlock;
+ buf = kmalloc(size, GFP_KERNEL);
+ if (!buf) {
+ usb_free_urb(urb);
+ return -ENOMEM;
+ }
- if (urb->status < 0 || !count)
- goto resubmit;
+ pipe = usb_rcvintpipe(data->udev, 0x81);
- if (usb_pipein(urb->pipe)) {
- data->hdev->stat.byte_rx += count;
+ usb_fill_int_urb(urb, data->udev, pipe, buf, size,
+ bpa10x_rx_complete, hdev, 1);
- if (usb_pipetype(urb->pipe) == PIPE_INTERRUPT)
- bpa10x_recv_event(data, buf, count);
+ urb->transfer_flags |= URB_FREE_BUFFER;
- if (usb_pipetype(urb->pipe) == PIPE_BULK)
- bpa10x_recv_bulk(data, buf, count);
- } else {
- data->hdev->stat.byte_tx += count;
+ usb_anchor_urb(urb, &data->rx_anchor);
- bpa10x_wakeup(data);
+ err = usb_submit_urb(urb, GFP_KERNEL);
+ if (err < 0) {
+ BT_ERR("%s urb %p submission failed (%d)",
+ hdev->name, urb, -err);
+ usb_unanchor_urb(urb);
+ kfree(buf);
}
-resubmit:
- if (usb_pipein(urb->pipe)) {
- err = usb_submit_urb(urb, GFP_ATOMIC);
- if (err < 0 && err != -ENODEV) {
- BT_ERR("%s urb %p type %d resubmit status %d",
- data->hdev->name, urb, usb_pipetype(urb->pipe), err);
- }
- }
+ usb_free_urb(urb);
-unlock:
- read_unlock(&data->lock);
+ return err;
}
-static inline struct urb *bpa10x_alloc_urb(struct usb_device *udev, unsigned int pipe,
- size_t size, gfp_t flags, void *data)
+static inline int bpa10x_submit_bulk_urb(struct hci_dev *hdev)
{
+ struct bpa10x_data *data = hdev->driver_data;
struct urb *urb;
- struct usb_ctrlrequest *cr;
unsigned char *buf;
+ unsigned int pipe;
+ int err, size = 64;
- BT_DBG("udev %p data %p", udev, data);
+ BT_DBG("%s", hdev->name);
- urb = usb_alloc_urb(0, flags);
+ urb = usb_alloc_urb(0, GFP_KERNEL);
if (!urb)
- return NULL;
+ return -ENOMEM;
- buf = kmalloc(size, flags);
+ buf = kmalloc(size, GFP_KERNEL);
if (!buf) {
usb_free_urb(urb);
- return NULL;
+ return -ENOMEM;
}
- switch (usb_pipetype(pipe)) {
- case PIPE_CONTROL:
- cr = kmalloc(sizeof(*cr), flags);
- if (!cr) {
- kfree(buf);
- usb_free_urb(urb);
- return NULL;
- }
+ pipe = usb_rcvbulkpipe(data->udev, 0x82);
- cr->bRequestType = USB_TYPE_VENDOR;
- cr->bRequest = 0;
- cr->wIndex = 0;
- cr->wValue = 0;
- cr->wLength = __cpu_to_le16(0);
+ usb_fill_bulk_urb(urb, data->udev, pipe,
+ buf, size, bpa10x_rx_complete, hdev);
- usb_fill_control_urb(urb, udev, pipe, (void *) cr, buf, 0, bpa10x_complete, data);
- break;
+ urb->transfer_flags |= URB_FREE_BUFFER;
- case PIPE_INTERRUPT:
- usb_fill_int_urb(urb, udev, pipe, buf, size, bpa10x_complete, data, 1);
- break;
+ usb_anchor_urb(urb, &data->rx_anchor);
- case PIPE_BULK:
- usb_fill_bulk_urb(urb, udev, pipe, buf, size, bpa10x_complete, data);
- break;
-
- default:
+ err = usb_submit_urb(urb, GFP_KERNEL);
+ if (err < 0) {
+ BT_ERR("%s urb %p submission failed (%d)",
+ hdev->name, urb, -err);
+ usb_unanchor_urb(urb);
kfree(buf);
- usb_free_urb(urb);
- return NULL;
}
- return urb;
-}
-
-static inline void bpa10x_free_urb(struct urb *urb)
-{
- BT_DBG("urb %p", urb);
-
- if (!urb)
- return;
-
- kfree(urb->setup_packet);
- kfree(urb->transfer_buffer);
-
usb_free_urb(urb);
+
+ return err;
}
static int bpa10x_open(struct hci_dev *hdev)
{
struct bpa10x_data *data = hdev->driver_data;
- struct usb_device *udev = data->udev;
- unsigned long flags;
int err;
- BT_DBG("hdev %p data %p", hdev, data);
+ BT_DBG("%s", hdev->name);
if (test_and_set_bit(HCI_RUNNING, &hdev->flags))
return 0;
- data->cmd_urb = bpa10x_alloc_urb(udev, usb_sndctrlpipe(udev, BPA10X_CMD_EP),
- BPA10X_CMD_BUF_SIZE, GFP_KERNEL, data);
- if (!data->cmd_urb) {
- err = -ENOMEM;
- goto done;
- }
-
- data->evt_urb = bpa10x_alloc_urb(udev, usb_rcvintpipe(udev, BPA10X_EVT_EP),
- BPA10X_EVT_BUF_SIZE, GFP_KERNEL, data);
- if (!data->evt_urb) {
- bpa10x_free_urb(data->cmd_urb);
- err = -ENOMEM;
- goto done;
- }
-
- data->rx_urb = bpa10x_alloc_urb(udev, usb_rcvbulkpipe(udev, BPA10X_RX_EP),
- BPA10X_RX_BUF_SIZE, GFP_KERNEL, data);
- if (!data->rx_urb) {
- bpa10x_free_urb(data->evt_urb);
- bpa10x_free_urb(data->cmd_urb);
- err = -ENOMEM;
- goto done;
- }
-
- data->tx_urb = bpa10x_alloc_urb(udev, usb_sndbulkpipe(udev, BPA10X_TX_EP),
- BPA10X_TX_BUF_SIZE, GFP_KERNEL, data);
- if (!data->rx_urb) {
- bpa10x_free_urb(data->rx_urb);
- bpa10x_free_urb(data->evt_urb);
- bpa10x_free_urb(data->cmd_urb);
- err = -ENOMEM;
- goto done;
- }
+ err = bpa10x_submit_intr_urb(hdev);
+ if (err < 0)
+ goto error;
- write_lock_irqsave(&data->lock, flags);
+ err = bpa10x_submit_bulk_urb(hdev);
+ if (err < 0)
+ goto error;
- err = usb_submit_urb(data->evt_urb, GFP_ATOMIC);
- if (err < 0) {
- BT_ERR("%s submit failed for event urb %p with error %d",
- data->hdev->name, data->evt_urb, err);
- } else {
- err = usb_submit_urb(data->rx_urb, GFP_ATOMIC);
- if (err < 0) {
- BT_ERR("%s submit failed for rx urb %p with error %d",
- data->hdev->name, data->evt_urb, err);
- usb_kill_urb(data->evt_urb);
- }
- }
+ return 0;
- write_unlock_irqrestore(&data->lock, flags);
+error:
+ usb_kill_anchored_urbs(&data->rx_anchor);
-done:
- if (err < 0)
- clear_bit(HCI_RUNNING, &hdev->flags);
+ clear_bit(HCI_RUNNING, &hdev->flags);
return err;
}
@@ -446,27 +339,13 @@ done:
static int bpa10x_close(struct hci_dev *hdev)
{
struct bpa10x_data *data = hdev->driver_data;
- unsigned long flags;
- BT_DBG("hdev %p data %p", hdev, data);
+ BT_DBG("%s", hdev->name);
if (!test_and_clear_bit(HCI_RUNNING, &hdev->flags))
return 0;
- write_lock_irqsave(&data->lock, flags);
-
- skb_queue_purge(&data->cmd_queue);
- usb_kill_urb(data->cmd_urb);
- usb_kill_urb(data->evt_urb);
- usb_kill_urb(data->rx_urb);
- usb_kill_urb(data->tx_urb);
-
- write_unlock_irqrestore(&data->lock, flags);
-
- bpa10x_free_urb(data->cmd_urb);
- bpa10x_free_urb(data->evt_urb);
- bpa10x_free_urb(data->rx_urb);
- bpa10x_free_urb(data->tx_urb);
+ usb_kill_anchored_urbs(&data->rx_anchor);
return 0;
}
@@ -475,9 +354,9 @@ static int bpa10x_flush(struct hci_dev *hdev)
{
struct bpa10x_data *data = hdev->driver_data;
- BT_DBG("hdev %p data %p", hdev, data);
+ BT_DBG("%s", hdev->name);
- skb_queue_purge(&data->cmd_queue);
+ usb_kill_anchored_urbs(&data->tx_anchor);
return 0;
}
@@ -485,45 +364,78 @@ static int bpa10x_flush(struct hci_dev *hdev)
static int bpa10x_send_frame(struct sk_buff *skb)
{
struct hci_dev *hdev = (struct hci_dev *) skb->dev;
- struct bpa10x_data *data;
-
- BT_DBG("hdev %p skb %p type %d len %d", hdev, skb, bt_cb(skb)->pkt_type, skb->len);
+ struct bpa10x_data *data = hdev->driver_data;
+ struct usb_ctrlrequest *dr;
+ struct urb *urb;
+ unsigned int pipe;
+ int err;
- if (!hdev) {
- BT_ERR("Frame for unknown HCI device");
- return -ENODEV;
- }
+ BT_DBG("%s", hdev->name);
if (!test_bit(HCI_RUNNING, &hdev->flags))
return -EBUSY;
- data = hdev->driver_data;
+ urb = usb_alloc_urb(0, GFP_ATOMIC);
+ if (!urb)
+ return -ENOMEM;
/* Prepend skb with frame type */
- memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1);
+ *skb_push(skb, 1) = bt_cb(skb)->pkt_type;
switch (bt_cb(skb)->pkt_type) {
case HCI_COMMAND_PKT:
+ dr = kmalloc(sizeof(*dr), GFP_ATOMIC);
+ if (!dr) {
+ usb_free_urb(urb);
+ return -ENOMEM;
+ }
+
+ dr->bRequestType = USB_TYPE_VENDOR;
+ dr->bRequest = 0;
+ dr->wIndex = 0;
+ dr->wValue = 0;
+ dr->wLength = __cpu_to_le16(skb->len);
+
+ pipe = usb_sndctrlpipe(data->udev, 0x00);
+
+ usb_fill_control_urb(urb, data->udev, pipe, (void *) dr,
+ skb->data, skb->len, bpa10x_tx_complete, skb);
+
hdev->stat.cmd_tx++;
- skb_queue_tail(&data->cmd_queue, skb);
break;
case HCI_ACLDATA_PKT:
+ pipe = usb_sndbulkpipe(data->udev, 0x02);
+
+ usb_fill_bulk_urb(urb, data->udev, pipe,
+ skb->data, skb->len, bpa10x_tx_complete, skb);
+
hdev->stat.acl_tx++;
- skb_queue_tail(&data->tx_queue, skb);
break;
case HCI_SCODATA_PKT:
+ pipe = usb_sndbulkpipe(data->udev, 0x02);
+
+ usb_fill_bulk_urb(urb, data->udev, pipe,
+ skb->data, skb->len, bpa10x_tx_complete, skb);
+
hdev->stat.sco_tx++;
- skb_queue_tail(&data->tx_queue, skb);
break;
- };
- read_lock(&data->lock);
+	default:
+		usb_free_urb(urb);
+		return -EILSEQ;
+	}
+
+ usb_anchor_urb(urb, &data->tx_anchor);
- bpa10x_wakeup(data);
+ err = usb_submit_urb(urb, GFP_ATOMIC);
+ if (err < 0) {
+ BT_ERR("%s urb %p submission failed", hdev->name, urb);
+ kfree(urb->setup_packet);
+ usb_unanchor_urb(urb);
+ }
- read_unlock(&data->lock);
+ usb_free_urb(urb);
-	return 0;
+	return err;
}
@@ -532,16 +444,17 @@ static void bpa10x_destruct(struct hci_dev *hdev)
{
struct bpa10x_data *data = hdev->driver_data;
- BT_DBG("hdev %p data %p", hdev, data);
+ BT_DBG("%s", hdev->name);
+	kfree_skb(data->rx_skb[0]);
+	kfree_skb(data->rx_skb[1]);
kfree(data);
}
static int bpa10x_probe(struct usb_interface *intf, const struct usb_device_id *id)
{
- struct usb_device *udev = interface_to_usbdev(intf);
- struct hci_dev *hdev;
struct bpa10x_data *data;
+ struct hci_dev *hdev;
int err;
BT_DBG("intf %p id %p", intf, id);
@@ -549,48 +462,43 @@ static int bpa10x_probe(struct usb_interface *intf, const struct usb_device_id *
if (ignore)
return -ENODEV;
- if (intf->cur_altsetting->desc.bInterfaceNumber > 0)
+ if (intf->cur_altsetting->desc.bInterfaceNumber != 0)
return -ENODEV;
data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data) {
- BT_ERR("Can't allocate data structure");
+ if (!data)
return -ENOMEM;
- }
-
- data->udev = udev;
- rwlock_init(&data->lock);
+ data->udev = interface_to_usbdev(intf);
- skb_queue_head_init(&data->cmd_queue);
- skb_queue_head_init(&data->tx_queue);
+ init_usb_anchor(&data->tx_anchor);
+ init_usb_anchor(&data->rx_anchor);
hdev = hci_alloc_dev();
if (!hdev) {
- BT_ERR("Can't allocate HCI device");
kfree(data);
return -ENOMEM;
}
- data->hdev = hdev;
-
hdev->type = HCI_USB;
hdev->driver_data = data;
+
+ data->hdev = hdev;
+
SET_HCIDEV_DEV(hdev, &intf->dev);
- hdev->open = bpa10x_open;
- hdev->close = bpa10x_close;
- hdev->flush = bpa10x_flush;
- hdev->send = bpa10x_send_frame;
- hdev->destruct = bpa10x_destruct;
+ hdev->open = bpa10x_open;
+ hdev->close = bpa10x_close;
+ hdev->flush = bpa10x_flush;
+ hdev->send = bpa10x_send_frame;
+ hdev->destruct = bpa10x_destruct;
hdev->owner = THIS_MODULE;
err = hci_register_dev(hdev);
if (err < 0) {
- BT_ERR("Can't register HCI device");
- kfree(data);
hci_free_dev(hdev);
+ kfree(data);
return err;
}
@@ -602,19 +510,17 @@ static int bpa10x_probe(struct usb_interface *intf, const struct usb_device_id *
static void bpa10x_disconnect(struct usb_interface *intf)
{
struct bpa10x_data *data = usb_get_intfdata(intf);
- struct hci_dev *hdev = data->hdev;
BT_DBG("intf %p", intf);
- if (!hdev)
+ if (!data)
return;
usb_set_intfdata(intf, NULL);
- if (hci_unregister_dev(hdev) < 0)
- BT_ERR("Can't unregister HCI device %s", hdev->name);
+ hci_unregister_dev(data->hdev);
- hci_free_dev(hdev);
+ hci_free_dev(data->hdev);
}
static struct usb_driver bpa10x_driver = {
@@ -626,15 +532,9 @@ static struct usb_driver bpa10x_driver = {
static int __init bpa10x_init(void)
{
- int err;
-
BT_INFO("Digianswer Bluetooth USB driver ver %s", VERSION);
- err = usb_register(&bpa10x_driver);
- if (err < 0)
- BT_ERR("Failed to register USB driver");
-
- return err;
+ return usb_register(&bpa10x_driver);
}
static void __exit bpa10x_exit(void)
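
Editor's note: the rewrite above drops the driver's hand-rolled URB bookkeeping
(cmd_urb/evt_urb/rx_urb/tx_urb plus a rwlock) in favour of USB anchors, so
teardown becomes a single usb_kill_anchored_urbs() call. A minimal sketch of
the pattern, assuming a driver-private struct with one anchor (the demo_ names
are illustrative, not part of this patch):

	#include <linux/usb.h>

	struct demo_data {
		struct usb_anchor rx_anchor;	/* init_usb_anchor() at probe */
	};

	/* Submit path: anchor before submitting, unanchor on failure so a
	 * dead URB never lingers on the anchor list. */
	static int demo_submit(struct demo_data *data, struct urb *urb)
	{
		int err;

		usb_anchor_urb(urb, &data->rx_anchor);

		err = usb_submit_urb(urb, GFP_KERNEL);
		if (err < 0)
			usb_unanchor_urb(urb);

		return err;
	}

	/* Teardown: one call cancels everything still in flight. */
	static void demo_stop(struct demo_data *data)
	{
		usb_kill_anchored_urbs(&data->rx_anchor);
	}
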
diff --git a/drivers/bluetooth/bt3c_cs.c b/drivers/bluetooth/bt3c_cs.c
index 39516074636..a18f9b8c9e1 100644
--- a/drivers/bluetooth/bt3c_cs.c
+++ b/drivers/bluetooth/bt3c_cs.c
@@ -344,10 +344,7 @@ static irqreturn_t bt3c_interrupt(int irq, void *dev_inst)
unsigned int iobase;
int iir;
- if (!info || !info->hdev) {
- BT_ERR("Call of irq %d for unknown device", irq);
- return IRQ_NONE;
- }
+ BUG_ON(!info->hdev);
iobase = info->p_dev->io.BasePort1;
diff --git a/drivers/bluetooth/btsdio.c b/drivers/bluetooth/btsdio.c
new file mode 100644
index 00000000000..b786f618790
--- /dev/null
+++ b/drivers/bluetooth/btsdio.c
@@ -0,0 +1,406 @@
+/*
+ *
+ * Generic Bluetooth SDIO driver
+ *
+ * Copyright (C) 2007 Cambridge Silicon Radio Ltd.
+ * Copyright (C) 2007 Marcel Holtmann <marcel@holtmann.org>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+
+#include <linux/mmc/sdio_ids.h>
+#include <linux/mmc/sdio_func.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+#ifndef CONFIG_BT_HCIBTSDIO_DEBUG
+#undef BT_DBG
+#define BT_DBG(D...)
+#endif
+
+#define VERSION "0.1"
+
+static const struct sdio_device_id btsdio_table[] = {
+ /* Generic Bluetooth Type-A SDIO device */
+ { SDIO_DEVICE_CLASS(SDIO_CLASS_BT_A) },
+
+ /* Generic Bluetooth Type-B SDIO device */
+ { SDIO_DEVICE_CLASS(SDIO_CLASS_BT_B) },
+
+ { } /* Terminating entry */
+};
+
+MODULE_DEVICE_TABLE(sdio, btsdio_table);
+
+struct btsdio_data {
+ struct hci_dev *hdev;
+ struct sdio_func *func;
+
+ struct work_struct work;
+
+ struct sk_buff_head txq;
+};
+
+#define REG_RDAT 0x00 /* Receiver Data */
+#define REG_TDAT 0x00 /* Transmitter Data */
+#define REG_PC_RRT 0x10 /* Read Packet Control */
+#define REG_PC_WRT 0x11 /* Write Packet Control */
+#define REG_RTC_STAT 0x12 /* Retry Control Status */
+#define REG_RTC_SET 0x12 /* Retry Control Set */
+#define REG_INTRD 0x13 /* Interrupt Indication */
+#define REG_CL_INTRD 0x13 /* Interrupt Clear */
+#define REG_EN_INTRD 0x14 /* Interrupt Enable */
+#define REG_MD_STAT 0x20 /* Bluetooth Mode Status */
+
+static int btsdio_tx_packet(struct btsdio_data *data, struct sk_buff *skb)
+{
+ int err;
+
+ BT_DBG("%s", data->hdev->name);
+
+ /* Prepend Type-A header */
+ skb_push(skb, 4);
+ skb->data[0] = (skb->len & 0x0000ff);
+ skb->data[1] = (skb->len & 0x00ff00) >> 8;
+ skb->data[2] = (skb->len & 0xff0000) >> 16;
+ skb->data[3] = bt_cb(skb)->pkt_type;
+
+ err = sdio_writesb(data->func, REG_TDAT, skb->data, skb->len);
+ if (err < 0) {
+ sdio_writeb(data->func, 0x01, REG_PC_WRT, NULL);
+ return err;
+ }
+
+ data->hdev->stat.byte_tx += skb->len;
+
+ kfree_skb(skb);
+
+ return 0;
+}
+
+static void btsdio_work(struct work_struct *work)
+{
+ struct btsdio_data *data = container_of(work, struct btsdio_data, work);
+ struct sk_buff *skb;
+ int err;
+
+ BT_DBG("%s", data->hdev->name);
+
+ sdio_claim_host(data->func);
+
+ while ((skb = skb_dequeue(&data->txq))) {
+ err = btsdio_tx_packet(data, skb);
+ if (err < 0) {
+ data->hdev->stat.err_tx++;
+ skb_queue_head(&data->txq, skb);
+ break;
+ }
+ }
+
+ sdio_release_host(data->func);
+}
+
+static int btsdio_rx_packet(struct btsdio_data *data)
+{
+ u8 hdr[4] __attribute__ ((aligned(4)));
+ struct sk_buff *skb;
+ int err, len;
+
+ BT_DBG("%s", data->hdev->name);
+
+ err = sdio_readsb(data->func, hdr, REG_RDAT, 4);
+ if (err < 0)
+ return err;
+
+ len = hdr[0] | (hdr[1] << 8) | (hdr[2] << 16);
+ if (len < 4 || len > 65543)
+ return -EILSEQ;
+
+ skb = bt_skb_alloc(len - 4, GFP_KERNEL);
+ if (!skb) {
+ /* Out of memory. Prepare a read retry and just
+ * return with the expectation that the next time
+ * we're called we'll have more memory. */
+ return -ENOMEM;
+ }
+
+ skb_put(skb, len - 4);
+
+ err = sdio_readsb(data->func, skb->data, REG_RDAT, len - 4);
+ if (err < 0) {
+		kfree_skb(skb);
+ return err;
+ }
+
+ data->hdev->stat.byte_rx += len;
+
+ skb->dev = (void *) data->hdev;
+ bt_cb(skb)->pkt_type = hdr[3];
+
+ err = hci_recv_frame(skb);
+	/* hci_recv_frame() frees the skb itself on error */
+	if (err < 0)
+		return err;
+
+ sdio_writeb(data->func, 0x00, REG_PC_RRT, NULL);
+
+ return 0;
+}
+
+static void btsdio_interrupt(struct sdio_func *func)
+{
+ struct btsdio_data *data = sdio_get_drvdata(func);
+ int intrd;
+
+ BT_DBG("%s", data->hdev->name);
+
+ intrd = sdio_readb(func, REG_INTRD, NULL);
+ if (intrd & 0x01) {
+ sdio_writeb(func, 0x01, REG_CL_INTRD, NULL);
+
+ if (btsdio_rx_packet(data) < 0) {
+ data->hdev->stat.err_rx++;
+ sdio_writeb(data->func, 0x01, REG_PC_RRT, NULL);
+ }
+ }
+}
+
+static int btsdio_open(struct hci_dev *hdev)
+{
+ struct btsdio_data *data = hdev->driver_data;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ if (test_and_set_bit(HCI_RUNNING, &hdev->flags))
+ return 0;
+
+ sdio_claim_host(data->func);
+
+ err = sdio_enable_func(data->func);
+ if (err < 0) {
+ clear_bit(HCI_RUNNING, &hdev->flags);
+ goto release;
+ }
+
+ err = sdio_claim_irq(data->func, btsdio_interrupt);
+ if (err < 0) {
+ sdio_disable_func(data->func);
+ clear_bit(HCI_RUNNING, &hdev->flags);
+ goto release;
+ }
+
+ if (data->func->class == SDIO_CLASS_BT_B)
+ sdio_writeb(data->func, 0x00, REG_MD_STAT, NULL);
+
+ sdio_writeb(data->func, 0x01, REG_EN_INTRD, NULL);
+
+release:
+ sdio_release_host(data->func);
+
+ return err;
+}
+
+static int btsdio_close(struct hci_dev *hdev)
+{
+ struct btsdio_data *data = hdev->driver_data;
+
+ BT_DBG("%s", hdev->name);
+
+ if (!test_and_clear_bit(HCI_RUNNING, &hdev->flags))
+ return 0;
+
+ sdio_claim_host(data->func);
+
+ sdio_writeb(data->func, 0x00, REG_EN_INTRD, NULL);
+
+ sdio_release_irq(data->func);
+ sdio_disable_func(data->func);
+
+ sdio_release_host(data->func);
+
+ return 0;
+}
+
+static int btsdio_flush(struct hci_dev *hdev)
+{
+ struct btsdio_data *data = hdev->driver_data;
+
+ BT_DBG("%s", hdev->name);
+
+ skb_queue_purge(&data->txq);
+
+ return 0;
+}
+
+static int btsdio_send_frame(struct sk_buff *skb)
+{
+ struct hci_dev *hdev = (struct hci_dev *) skb->dev;
+ struct btsdio_data *data = hdev->driver_data;
+
+ BT_DBG("%s", hdev->name);
+
+ if (!test_bit(HCI_RUNNING, &hdev->flags))
+ return -EBUSY;
+
+ switch (bt_cb(skb)->pkt_type) {
+ case HCI_COMMAND_PKT:
+ hdev->stat.cmd_tx++;
+ break;
+
+ case HCI_ACLDATA_PKT:
+ hdev->stat.acl_tx++;
+ break;
+
+ case HCI_SCODATA_PKT:
+ hdev->stat.sco_tx++;
+ break;
+
+ default:
+ return -EILSEQ;
+ }
+
+ skb_queue_tail(&data->txq, skb);
+
+ schedule_work(&data->work);
+
+ return 0;
+}
+
+static void btsdio_destruct(struct hci_dev *hdev)
+{
+ struct btsdio_data *data = hdev->driver_data;
+
+ BT_DBG("%s", hdev->name);
+
+ kfree(data);
+}
+
+static int btsdio_probe(struct sdio_func *func,
+ const struct sdio_device_id *id)
+{
+ struct btsdio_data *data;
+ struct hci_dev *hdev;
+ struct sdio_func_tuple *tuple = func->tuples;
+ int err;
+
+ BT_DBG("func %p id %p class 0x%04x", func, id, func->class);
+
+ while (tuple) {
+ BT_DBG("code 0x%x size %d", tuple->code, tuple->size);
+ tuple = tuple->next;
+ }
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ data->func = func;
+
+ INIT_WORK(&data->work, btsdio_work);
+
+ skb_queue_head_init(&data->txq);
+
+ hdev = hci_alloc_dev();
+ if (!hdev) {
+ kfree(data);
+ return -ENOMEM;
+ }
+
+ hdev->type = HCI_SDIO;
+ hdev->driver_data = data;
+
+ data->hdev = hdev;
+
+ SET_HCIDEV_DEV(hdev, &func->dev);
+
+ hdev->open = btsdio_open;
+ hdev->close = btsdio_close;
+ hdev->flush = btsdio_flush;
+ hdev->send = btsdio_send_frame;
+ hdev->destruct = btsdio_destruct;
+
+ hdev->owner = THIS_MODULE;
+
+ err = hci_register_dev(hdev);
+ if (err < 0) {
+ hci_free_dev(hdev);
+ kfree(data);
+ return err;
+ }
+
+ sdio_set_drvdata(func, data);
+
+ return 0;
+}
+
+static void btsdio_remove(struct sdio_func *func)
+{
+ struct btsdio_data *data = sdio_get_drvdata(func);
+ struct hci_dev *hdev;
+
+ BT_DBG("func %p", func);
+
+ if (!data)
+ return;
+
+ hdev = data->hdev;
+
+ sdio_set_drvdata(func, NULL);
+
+ hci_unregister_dev(hdev);
+
+ hci_free_dev(hdev);
+}
+
+static struct sdio_driver btsdio_driver = {
+ .name = "btsdio",
+ .probe = btsdio_probe,
+ .remove = btsdio_remove,
+ .id_table = btsdio_table,
+};
+
+static int __init btsdio_init(void)
+{
+ BT_INFO("Generic Bluetooth SDIO driver ver %s", VERSION);
+
+ return sdio_register_driver(&btsdio_driver);
+}
+
+static void __exit btsdio_exit(void)
+{
+ sdio_unregister_driver(&btsdio_driver);
+}
+
+module_init(btsdio_init);
+module_exit(btsdio_exit);
+
+MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
+MODULE_DESCRIPTION("Generic Bluetooth SDIO driver ver " VERSION);
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
diff --git a/drivers/bluetooth/btuart_cs.c b/drivers/bluetooth/btuart_cs.c
index d7d2ea0d86a..08f48d577ab 100644
--- a/drivers/bluetooth/btuart_cs.c
+++ b/drivers/bluetooth/btuart_cs.c
@@ -294,10 +294,7 @@ static irqreturn_t btuart_interrupt(int irq, void *dev_inst)
int boguscount = 0;
int iir, lsr;
- if (!info || !info->hdev) {
- BT_ERR("Call of irq %d for unknown device", irq);
- return IRQ_NONE;
- }
+ BUG_ON(!info->hdev);
iobase = info->p_dev->io.BasePort1;
diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
new file mode 100644
index 00000000000..12e108914f1
--- /dev/null
+++ b/drivers/bluetooth/btusb.c
@@ -0,0 +1,564 @@
+/*
+ *
+ * Generic Bluetooth USB driver
+ *
+ * Copyright (C) 2005-2007 Marcel Holtmann <marcel@holtmann.org>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+
+#include <linux/usb.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+#ifndef CONFIG_BT_HCIBTUSB_DEBUG
+#undef BT_DBG
+#define BT_DBG(D...)
+#endif
+
+#define VERSION "0.1"
+
+static struct usb_device_id btusb_table[] = {
+ /* Generic Bluetooth USB device */
+ { USB_DEVICE_INFO(0xe0, 0x01, 0x01) },
+
+ { } /* Terminating entry */
+};
+
+MODULE_DEVICE_TABLE(usb, btusb_table);
+
+static struct usb_device_id blacklist_table[] = {
+ { } /* Terminating entry */
+};
+
+#define BTUSB_INTR_RUNNING 0
+#define BTUSB_BULK_RUNNING 1
+
+struct btusb_data {
+ struct hci_dev *hdev;
+ struct usb_device *udev;
+
+ spinlock_t lock;
+
+ unsigned long flags;
+
+ struct work_struct work;
+
+ struct usb_anchor tx_anchor;
+ struct usb_anchor intr_anchor;
+ struct usb_anchor bulk_anchor;
+
+ struct usb_endpoint_descriptor *intr_ep;
+ struct usb_endpoint_descriptor *bulk_tx_ep;
+ struct usb_endpoint_descriptor *bulk_rx_ep;
+};
+
+static void btusb_intr_complete(struct urb *urb)
+{
+ struct hci_dev *hdev = urb->context;
+ struct btusb_data *data = hdev->driver_data;
+ int err;
+
+ BT_DBG("%s urb %p status %d count %d", hdev->name,
+ urb, urb->status, urb->actual_length);
+
+ if (!test_bit(HCI_RUNNING, &hdev->flags))
+ return;
+
+ if (urb->status == 0) {
+ if (hci_recv_fragment(hdev, HCI_EVENT_PKT,
+ urb->transfer_buffer,
+ urb->actual_length) < 0) {
+ BT_ERR("%s corrupted event packet", hdev->name);
+ hdev->stat.err_rx++;
+ }
+ }
+
+ if (!test_bit(BTUSB_INTR_RUNNING, &data->flags))
+ return;
+
+ usb_anchor_urb(urb, &data->intr_anchor);
+
+ err = usb_submit_urb(urb, GFP_ATOMIC);
+ if (err < 0) {
+ BT_ERR("%s urb %p failed to resubmit (%d)",
+ hdev->name, urb, -err);
+ usb_unanchor_urb(urb);
+ }
+}
+
+static inline int btusb_submit_intr_urb(struct hci_dev *hdev)
+{
+ struct btusb_data *data = hdev->driver_data;
+ struct urb *urb;
+ unsigned char *buf;
+ unsigned int pipe;
+ int err, size;
+
+ BT_DBG("%s", hdev->name);
+
+ urb = usb_alloc_urb(0, GFP_ATOMIC);
+ if (!urb)
+ return -ENOMEM;
+
+ size = le16_to_cpu(data->intr_ep->wMaxPacketSize);
+
+ buf = kmalloc(size, GFP_ATOMIC);
+ if (!buf) {
+ usb_free_urb(urb);
+ return -ENOMEM;
+ }
+
+ pipe = usb_rcvintpipe(data->udev, data->intr_ep->bEndpointAddress);
+
+ usb_fill_int_urb(urb, data->udev, pipe, buf, size,
+ btusb_intr_complete, hdev,
+ data->intr_ep->bInterval);
+
+ urb->transfer_flags |= URB_FREE_BUFFER;
+
+ usb_anchor_urb(urb, &data->intr_anchor);
+
+ err = usb_submit_urb(urb, GFP_ATOMIC);
+ if (err < 0) {
+ BT_ERR("%s urb %p submission failed (%d)",
+ hdev->name, urb, -err);
+ usb_unanchor_urb(urb);
+ kfree(buf);
+ }
+
+ usb_free_urb(urb);
+
+ return err;
+}
+
+static void btusb_bulk_complete(struct urb *urb)
+{
+ struct hci_dev *hdev = urb->context;
+ struct btusb_data *data = hdev->driver_data;
+ int err;
+
+ BT_DBG("%s urb %p status %d count %d", hdev->name,
+ urb, urb->status, urb->actual_length);
+
+ if (!test_bit(HCI_RUNNING, &hdev->flags))
+ return;
+
+ if (urb->status == 0) {
+ if (hci_recv_fragment(hdev, HCI_ACLDATA_PKT,
+ urb->transfer_buffer,
+ urb->actual_length) < 0) {
+ BT_ERR("%s corrupted ACL packet", hdev->name);
+ hdev->stat.err_rx++;
+ }
+ }
+
+ if (!test_bit(BTUSB_BULK_RUNNING, &data->flags))
+ return;
+
+ usb_anchor_urb(urb, &data->bulk_anchor);
+
+ err = usb_submit_urb(urb, GFP_ATOMIC);
+ if (err < 0) {
+ BT_ERR("%s urb %p failed to resubmit (%d)",
+ hdev->name, urb, -err);
+ usb_unanchor_urb(urb);
+ }
+}
+
+static inline int btusb_submit_bulk_urb(struct hci_dev *hdev)
+{
+ struct btusb_data *data = hdev->driver_data;
+ struct urb *urb;
+ unsigned char *buf;
+ unsigned int pipe;
+ int err, size;
+
+ BT_DBG("%s", hdev->name);
+
+ urb = usb_alloc_urb(0, GFP_KERNEL);
+ if (!urb)
+ return -ENOMEM;
+
+ size = le16_to_cpu(data->bulk_rx_ep->wMaxPacketSize);
+
+ buf = kmalloc(size, GFP_KERNEL);
+ if (!buf) {
+ usb_free_urb(urb);
+ return -ENOMEM;
+ }
+
+ pipe = usb_rcvbulkpipe(data->udev, data->bulk_rx_ep->bEndpointAddress);
+
+ usb_fill_bulk_urb(urb, data->udev, pipe,
+ buf, size, btusb_bulk_complete, hdev);
+
+ urb->transfer_flags |= URB_FREE_BUFFER;
+
+ usb_anchor_urb(urb, &data->bulk_anchor);
+
+ err = usb_submit_urb(urb, GFP_KERNEL);
+ if (err < 0) {
+ BT_ERR("%s urb %p submission failed (%d)",
+ hdev->name, urb, -err);
+ usb_unanchor_urb(urb);
+ kfree(buf);
+ }
+
+ usb_free_urb(urb);
+
+ return err;
+}
+
+static void btusb_tx_complete(struct urb *urb)
+{
+ struct sk_buff *skb = urb->context;
+ struct hci_dev *hdev = (struct hci_dev *) skb->dev;
+
+ BT_DBG("%s urb %p status %d count %d", hdev->name,
+ urb, urb->status, urb->actual_length);
+
+ if (!test_bit(HCI_RUNNING, &hdev->flags))
+ goto done;
+
+ if (!urb->status)
+ hdev->stat.byte_tx += urb->transfer_buffer_length;
+ else
+ hdev->stat.err_tx++;
+
+done:
+ kfree(urb->setup_packet);
+
+ kfree_skb(skb);
+}
+
+static int btusb_open(struct hci_dev *hdev)
+{
+ struct btusb_data *data = hdev->driver_data;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ if (test_and_set_bit(HCI_RUNNING, &hdev->flags))
+ return 0;
+
+ if (test_and_set_bit(BTUSB_INTR_RUNNING, &data->flags))
+ return 0;
+
+ err = btusb_submit_intr_urb(hdev);
+ if (err < 0) {
+		clear_bit(BTUSB_INTR_RUNNING, &data->flags);
+ clear_bit(HCI_RUNNING, &hdev->flags);
+ }
+
+ return err;
+}
+
+static int btusb_close(struct hci_dev *hdev)
+{
+ struct btusb_data *data = hdev->driver_data;
+
+ BT_DBG("%s", hdev->name);
+
+ if (!test_and_clear_bit(HCI_RUNNING, &hdev->flags))
+ return 0;
+
+ clear_bit(BTUSB_BULK_RUNNING, &data->flags);
+ usb_kill_anchored_urbs(&data->bulk_anchor);
+
+ clear_bit(BTUSB_INTR_RUNNING, &data->flags);
+ usb_kill_anchored_urbs(&data->intr_anchor);
+
+ return 0;
+}
+
+static int btusb_flush(struct hci_dev *hdev)
+{
+ struct btusb_data *data = hdev->driver_data;
+
+ BT_DBG("%s", hdev->name);
+
+ usb_kill_anchored_urbs(&data->tx_anchor);
+
+ return 0;
+}
+
+static int btusb_send_frame(struct sk_buff *skb)
+{
+ struct hci_dev *hdev = (struct hci_dev *) skb->dev;
+ struct btusb_data *data = hdev->driver_data;
+ struct usb_ctrlrequest *dr;
+ struct urb *urb;
+ unsigned int pipe;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ if (!test_bit(HCI_RUNNING, &hdev->flags))
+ return -EBUSY;
+
+ switch (bt_cb(skb)->pkt_type) {
+ case HCI_COMMAND_PKT:
+ urb = usb_alloc_urb(0, GFP_ATOMIC);
+ if (!urb)
+ return -ENOMEM;
+
+ dr = kmalloc(sizeof(*dr), GFP_ATOMIC);
+ if (!dr) {
+ usb_free_urb(urb);
+ return -ENOMEM;
+ }
+
+ dr->bRequestType = USB_TYPE_CLASS;
+ dr->bRequest = 0;
+ dr->wIndex = 0;
+ dr->wValue = 0;
+ dr->wLength = __cpu_to_le16(skb->len);
+
+ pipe = usb_sndctrlpipe(data->udev, 0x00);
+
+ usb_fill_control_urb(urb, data->udev, pipe, (void *) dr,
+ skb->data, skb->len, btusb_tx_complete, skb);
+
+ hdev->stat.cmd_tx++;
+ break;
+
+ case HCI_ACLDATA_PKT:
+ urb = usb_alloc_urb(0, GFP_ATOMIC);
+ if (!urb)
+ return -ENOMEM;
+
+ pipe = usb_sndbulkpipe(data->udev,
+ data->bulk_tx_ep->bEndpointAddress);
+
+ usb_fill_bulk_urb(urb, data->udev, pipe,
+ skb->data, skb->len, btusb_tx_complete, skb);
+
+ hdev->stat.acl_tx++;
+ break;
+
+ case HCI_SCODATA_PKT:
+ hdev->stat.sco_tx++;
+ kfree_skb(skb);
+ return 0;
+
+ default:
+ return -EILSEQ;
+ }
+
+ usb_anchor_urb(urb, &data->tx_anchor);
+
+ err = usb_submit_urb(urb, GFP_ATOMIC);
+ if (err < 0) {
+ BT_ERR("%s urb %p submission failed", hdev->name, urb);
+ kfree(urb->setup_packet);
+ usb_unanchor_urb(urb);
+ }
+
+ usb_free_urb(urb);
+
+ return err;
+}
+
+static void btusb_destruct(struct hci_dev *hdev)
+{
+ struct btusb_data *data = hdev->driver_data;
+
+ BT_DBG("%s", hdev->name);
+
+ kfree(data);
+}
+
+static void btusb_notify(struct hci_dev *hdev, unsigned int evt)
+{
+ struct btusb_data *data = hdev->driver_data;
+
+ BT_DBG("%s evt %d", hdev->name, evt);
+
+ if (evt == HCI_NOTIFY_CONN_ADD || evt == HCI_NOTIFY_CONN_DEL)
+ schedule_work(&data->work);
+}
+
+static void btusb_work(struct work_struct *work)
+{
+ struct btusb_data *data = container_of(work, struct btusb_data, work);
+ struct hci_dev *hdev = data->hdev;
+
+ if (hdev->conn_hash.acl_num == 0) {
+ clear_bit(BTUSB_BULK_RUNNING, &data->flags);
+ usb_kill_anchored_urbs(&data->bulk_anchor);
+ return;
+ }
+
+ if (!test_and_set_bit(BTUSB_BULK_RUNNING, &data->flags)) {
+ if (btusb_submit_bulk_urb(hdev) < 0)
+ clear_bit(BTUSB_BULK_RUNNING, &data->flags);
+ else
+ btusb_submit_bulk_urb(hdev);
+ }
+}
+
+static int btusb_probe(struct usb_interface *intf,
+ const struct usb_device_id *id)
+{
+ struct usb_endpoint_descriptor *ep_desc;
+ struct btusb_data *data;
+ struct hci_dev *hdev;
+ int i, err;
+
+ BT_DBG("intf %p id %p", intf, id);
+
+ if (intf->cur_altsetting->desc.bInterfaceNumber != 0)
+ return -ENODEV;
+
+ if (!id->driver_info) {
+ const struct usb_device_id *match;
+ match = usb_match_id(intf, blacklist_table);
+ if (match)
+ id = match;
+ }
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ for (i = 0; i < intf->cur_altsetting->desc.bNumEndpoints; i++) {
+ ep_desc = &intf->cur_altsetting->endpoint[i].desc;
+
+ if (!data->intr_ep && usb_endpoint_is_int_in(ep_desc)) {
+ data->intr_ep = ep_desc;
+ continue;
+ }
+
+ if (!data->bulk_tx_ep && usb_endpoint_is_bulk_out(ep_desc)) {
+ data->bulk_tx_ep = ep_desc;
+ continue;
+ }
+
+ if (!data->bulk_rx_ep && usb_endpoint_is_bulk_in(ep_desc)) {
+ data->bulk_rx_ep = ep_desc;
+ continue;
+ }
+ }
+
+ if (!data->intr_ep || !data->bulk_tx_ep || !data->bulk_rx_ep) {
+ kfree(data);
+ return -ENODEV;
+ }
+
+ data->udev = interface_to_usbdev(intf);
+
+ spin_lock_init(&data->lock);
+
+ INIT_WORK(&data->work, btusb_work);
+
+ init_usb_anchor(&data->tx_anchor);
+ init_usb_anchor(&data->intr_anchor);
+ init_usb_anchor(&data->bulk_anchor);
+
+ hdev = hci_alloc_dev();
+ if (!hdev) {
+ kfree(data);
+ return -ENOMEM;
+ }
+
+ hdev->type = HCI_USB;
+ hdev->driver_data = data;
+
+ data->hdev = hdev;
+
+ SET_HCIDEV_DEV(hdev, &intf->dev);
+
+ hdev->open = btusb_open;
+ hdev->close = btusb_close;
+ hdev->flush = btusb_flush;
+ hdev->send = btusb_send_frame;
+ hdev->destruct = btusb_destruct;
+ hdev->notify = btusb_notify;
+
+ hdev->owner = THIS_MODULE;
+
+ set_bit(HCI_QUIRK_RESET_ON_INIT, &hdev->quirks);
+
+ err = hci_register_dev(hdev);
+ if (err < 0) {
+ hci_free_dev(hdev);
+ kfree(data);
+ return err;
+ }
+
+ usb_set_intfdata(intf, data);
+
+ return 0;
+}
+
+static void btusb_disconnect(struct usb_interface *intf)
+{
+ struct btusb_data *data = usb_get_intfdata(intf);
+ struct hci_dev *hdev;
+
+ BT_DBG("intf %p", intf);
+
+ if (!data)
+ return;
+
+ hdev = data->hdev;
+
+ usb_set_intfdata(intf, NULL);
+
+ hci_unregister_dev(hdev);
+
+ hci_free_dev(hdev);
+}
+
+static struct usb_driver btusb_driver = {
+ .name = "btusb",
+ .probe = btusb_probe,
+ .disconnect = btusb_disconnect,
+ .id_table = btusb_table,
+};
+
+static int __init btusb_init(void)
+{
+ BT_INFO("Generic Bluetooth USB driver ver %s", VERSION);
+
+ return usb_register(&btusb_driver);
+}
+
+static void __exit btusb_exit(void)
+{
+ usb_deregister(&btusb_driver);
+}
+
+module_init(btusb_init);
+module_exit(btusb_exit);
+
+MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
+MODULE_DESCRIPTION("Generic Bluetooth USB driver ver " VERSION);
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
diff --git a/drivers/bluetooth/dtl1_cs.c b/drivers/bluetooth/dtl1_cs.c
index 7f9c54b9964..dae45cdf02b 100644
--- a/drivers/bluetooth/dtl1_cs.c
+++ b/drivers/bluetooth/dtl1_cs.c
@@ -298,10 +298,7 @@ static irqreturn_t dtl1_interrupt(int irq, void *dev_inst)
int boguscount = 0;
int iir, lsr;
- if (!info || !info->hdev) {
- BT_ERR("Call of irq %d for unknown device", irq);
- return IRQ_NONE;
- }
+ BUG_ON(!info->hdev);
iobase = info->p_dev->io.BasePort1;
diff --git a/drivers/bluetooth/hci_bcsp.c b/drivers/bluetooth/hci_bcsp.c
index d66064ccb31..696f7528f02 100644
--- a/drivers/bluetooth/hci_bcsp.c
+++ b/drivers/bluetooth/hci_bcsp.c
@@ -237,7 +237,8 @@ static struct sk_buff *bcsp_prepare_pkt(struct bcsp_struct *bcsp, u8 *data,
if (hciextn && chan == 5) {
struct hci_command_hdr *hdr = (struct hci_command_hdr *) data;
- if (hci_opcode_ogf(__le16_to_cpu(hdr->opcode)) == OGF_VENDOR_CMD) {
+ /* Vendor specific commands */
+ if (hci_opcode_ogf(__le16_to_cpu(hdr->opcode)) == 0x3f) {
u8 desc = *(data + HCI_COMMAND_HDR_SIZE);
if ((desc & 0xf0) == 0xc0) {
data += HCI_COMMAND_HDR_SIZE + 1;
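
Editor's note: the magic 0x3f above is the vendor-specific OGF. An HCI opcode
packs the OGF into its top 6 bits and the OCF into the low 10 bits, so the
check reads "is this a vendor command". Illustrative equivalents of the
kernel's hci_opcode_ogf()/hci_opcode_ocf() helpers (the demo_ names are not
from this patch):

	/* 16-bit HCI opcode: | OGF (6 bits) | OCF (10 bits) | */
	#define demo_opcode_ogf(op)	((op) >> 10)	/* 0x3f = vendor */
	#define demo_opcode_ocf(op)	((op) & 0x03ff)
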
diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index 6055b9c0ac0..e68821d074b 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -549,7 +549,10 @@ static int __init hci_uart_init(void)
#ifdef CONFIG_BT_HCIUART_BCSP
bcsp_init();
#endif
-
+#ifdef CONFIG_BT_HCIUART_LL
+ ll_init();
+#endif
+
return 0;
}
@@ -563,6 +566,9 @@ static void __exit hci_uart_exit(void)
#ifdef CONFIG_BT_HCIUART_BCSP
bcsp_deinit();
#endif
+#ifdef CONFIG_BT_HCIUART_LL
+ ll_deinit();
+#endif
/* Release tty registration of line discipline */
if ((err = tty_unregister_ldisc(N_HCI)))
diff --git a/drivers/bluetooth/hci_ll.c b/drivers/bluetooth/hci_ll.c
new file mode 100644
index 00000000000..8c3e62a17b4
--- /dev/null
+++ b/drivers/bluetooth/hci_ll.c
@@ -0,0 +1,531 @@
+/*
+ * Texas Instruments' Bluetooth HCILL UART protocol
+ *
+ * HCILL (HCI Low Level) is a Texas Instruments' power management
+ * protocol extension to H4.
+ *
+ * Copyright (C) 2007 Texas Instruments, Inc.
+ *
+ * Written by Ohad Ben-Cohen <ohad@bencohen.org>
+ *
+ * Acknowledgements:
+ * This file is based on hci_h4.c, which was written
+ * by Maxim Krasnyansky and Marcel Holtmann.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/interrupt.h>
+#include <linux/ptrace.h>
+#include <linux/poll.h>
+
+#include <linux/slab.h>
+#include <linux/tty.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/signal.h>
+#include <linux/ioctl.h>
+#include <linux/skbuff.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+#include "hci_uart.h"
+
+/* HCILL commands */
+#define HCILL_GO_TO_SLEEP_IND 0x30
+#define HCILL_GO_TO_SLEEP_ACK 0x31
+#define HCILL_WAKE_UP_IND 0x32
+#define HCILL_WAKE_UP_ACK 0x33
+
+/* HCILL receiver States */
+#define HCILL_W4_PACKET_TYPE 0
+#define HCILL_W4_EVENT_HDR 1
+#define HCILL_W4_ACL_HDR 2
+#define HCILL_W4_SCO_HDR 3
+#define HCILL_W4_DATA 4
+
+/* HCILL states */
+enum hcill_states_e {
+ HCILL_ASLEEP,
+ HCILL_ASLEEP_TO_AWAKE,
+ HCILL_AWAKE,
+ HCILL_AWAKE_TO_ASLEEP
+};
+
+struct hcill_cmd {
+ u8 cmd;
+} __attribute__((packed));
+
+struct ll_struct {
+ unsigned long rx_state;
+ unsigned long rx_count;
+ struct sk_buff *rx_skb;
+ struct sk_buff_head txq;
+ spinlock_t hcill_lock; /* HCILL state lock */
+ unsigned long hcill_state; /* HCILL power state */
+ struct sk_buff_head tx_wait_q; /* HCILL wait queue */
+};
+
+/*
+ * Builds and sends an HCILL command packet.
+ * These are very simple packets with only 1 cmd byte
+ */
+static int send_hcill_cmd(u8 cmd, struct hci_uart *hu)
+{
+ int err = 0;
+ struct sk_buff *skb = NULL;
+ struct ll_struct *ll = hu->priv;
+ struct hcill_cmd *hcill_packet;
+
+ BT_DBG("hu %p cmd 0x%x", hu, cmd);
+
+ /* allocate packet */
+ skb = bt_skb_alloc(1, GFP_ATOMIC);
+ if (!skb) {
+ BT_ERR("cannot allocate memory for HCILL packet");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* prepare packet */
+ hcill_packet = (struct hcill_cmd *) skb_put(skb, 1);
+ hcill_packet->cmd = cmd;
+ skb->dev = (void *) hu->hdev;
+
+ /* send packet */
+ skb_queue_tail(&ll->txq, skb);
+out:
+ return err;
+}
+
+/* Initialize protocol */
+static int ll_open(struct hci_uart *hu)
+{
+ struct ll_struct *ll;
+
+ BT_DBG("hu %p", hu);
+
+ ll = kzalloc(sizeof(*ll), GFP_ATOMIC);
+ if (!ll)
+ return -ENOMEM;
+
+ skb_queue_head_init(&ll->txq);
+ skb_queue_head_init(&ll->tx_wait_q);
+ spin_lock_init(&ll->hcill_lock);
+
+ ll->hcill_state = HCILL_AWAKE;
+
+ hu->priv = ll;
+
+ return 0;
+}
+
+/* Flush protocol data */
+static int ll_flush(struct hci_uart *hu)
+{
+ struct ll_struct *ll = hu->priv;
+
+ BT_DBG("hu %p", hu);
+
+ skb_queue_purge(&ll->tx_wait_q);
+ skb_queue_purge(&ll->txq);
+
+ return 0;
+}
+
+/* Close protocol */
+static int ll_close(struct hci_uart *hu)
+{
+ struct ll_struct *ll = hu->priv;
+
+ BT_DBG("hu %p", hu);
+
+ skb_queue_purge(&ll->tx_wait_q);
+ skb_queue_purge(&ll->txq);
+
+ if (ll->rx_skb)
+ kfree_skb(ll->rx_skb);
+
+ hu->priv = NULL;
+
+ kfree(ll);
+
+ return 0;
+}
+
+/*
+ * internal function, which does common work of the device wake up process:
+ * 1. places all pending packets (waiting in tx_wait_q list) in txq list.
+ * 2. changes internal state to HCILL_AWAKE.
+ * Note: assumes that hcill_lock spinlock is taken,
+ * shouldn't be called otherwise!
+ */
+static void __ll_do_awake(struct ll_struct *ll)
+{
+ struct sk_buff *skb = NULL;
+
+ while ((skb = skb_dequeue(&ll->tx_wait_q)))
+ skb_queue_tail(&ll->txq, skb);
+
+ ll->hcill_state = HCILL_AWAKE;
+}
+
+/*
+ * Called upon a wake-up-indication from the device
+ */
+static void ll_device_want_to_wakeup(struct hci_uart *hu)
+{
+ unsigned long flags;
+ struct ll_struct *ll = hu->priv;
+
+ BT_DBG("hu %p", hu);
+
+ /* lock hcill state */
+ spin_lock_irqsave(&ll->hcill_lock, flags);
+
+ switch (ll->hcill_state) {
+ case HCILL_ASLEEP:
+ /* acknowledge device wake up */
+ if (send_hcill_cmd(HCILL_WAKE_UP_ACK, hu) < 0) {
+ BT_ERR("cannot acknowledge device wake up");
+ goto out;
+ }
+ break;
+ case HCILL_ASLEEP_TO_AWAKE:
+ /*
+ * this state means that a wake-up-indication
+ * is already on its way to the device,
+ * and will serve as the required wake-up-ack
+ */
+ BT_DBG("dual wake-up-indication");
+ break;
+ default:
+		/* any other state is illegal */
+ BT_ERR("received HCILL_WAKE_UP_IND in state %ld", ll->hcill_state);
+ break;
+ }
+
+ /* send pending packets and change state to HCILL_AWAKE */
+ __ll_do_awake(ll);
+
+out:
+ spin_unlock_irqrestore(&ll->hcill_lock, flags);
+
+ /* actually send the packets */
+ hci_uart_tx_wakeup(hu);
+}
+
+/*
+ * Called upon a sleep-indication from the device
+ */
+static void ll_device_want_to_sleep(struct hci_uart *hu)
+{
+ unsigned long flags;
+ struct ll_struct *ll = hu->priv;
+
+ BT_DBG("hu %p", hu);
+
+ /* lock hcill state */
+ spin_lock_irqsave(&ll->hcill_lock, flags);
+
+ /* sanity check */
+ if (ll->hcill_state != HCILL_AWAKE)
+ BT_ERR("ERR: HCILL_GO_TO_SLEEP_IND in state %ld", ll->hcill_state);
+
+ /* acknowledge device sleep */
+ if (send_hcill_cmd(HCILL_GO_TO_SLEEP_ACK, hu) < 0) {
+ BT_ERR("cannot acknowledge device sleep");
+ goto out;
+ }
+
+ /* update state */
+ ll->hcill_state = HCILL_ASLEEP;
+
+out:
+ spin_unlock_irqrestore(&ll->hcill_lock, flags);
+
+ /* actually send the sleep ack packet */
+ hci_uart_tx_wakeup(hu);
+}
+
+/*
+ * Called upon wake-up-acknowledgement from the device
+ */
+static void ll_device_woke_up(struct hci_uart *hu)
+{
+ unsigned long flags;
+ struct ll_struct *ll = hu->priv;
+
+ BT_DBG("hu %p", hu);
+
+ /* lock hcill state */
+ spin_lock_irqsave(&ll->hcill_lock, flags);
+
+ /* sanity check */
+ if (ll->hcill_state != HCILL_ASLEEP_TO_AWAKE)
+ BT_ERR("received HCILL_WAKE_UP_ACK in state %ld", ll->hcill_state);
+
+ /* send pending packets and change state to HCILL_AWAKE */
+ __ll_do_awake(ll);
+
+ spin_unlock_irqrestore(&ll->hcill_lock, flags);
+
+ /* actually send the packets */
+ hci_uart_tx_wakeup(hu);
+}
+
+/* Enqueue frame for transmission (padding, crc, etc) */
+/* may be called from two simultaneous tasklets */
+static int ll_enqueue(struct hci_uart *hu, struct sk_buff *skb)
+{
+ unsigned long flags = 0;
+ struct ll_struct *ll = hu->priv;
+
+ BT_DBG("hu %p skb %p", hu, skb);
+
+ /* Prepend skb with frame type */
+ memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1);
+
+ /* lock hcill state */
+ spin_lock_irqsave(&ll->hcill_lock, flags);
+
+ /* act according to current state */
+ switch (ll->hcill_state) {
+ case HCILL_AWAKE:
+ BT_DBG("device awake, sending normally");
+ skb_queue_tail(&ll->txq, skb);
+ break;
+ case HCILL_ASLEEP:
+ BT_DBG("device asleep, waking up and queueing packet");
+ /* save packet for later */
+ skb_queue_tail(&ll->tx_wait_q, skb);
+ /* awake device */
+ if (send_hcill_cmd(HCILL_WAKE_UP_IND, hu) < 0) {
+ BT_ERR("cannot wake up device");
+ break;
+ }
+ ll->hcill_state = HCILL_ASLEEP_TO_AWAKE;
+ break;
+ case HCILL_ASLEEP_TO_AWAKE:
+ BT_DBG("device waking up, queueing packet");
+ /* transient state; just keep packet for later */
+ skb_queue_tail(&ll->tx_wait_q, skb);
+ break;
+ default:
+ BT_ERR("illegal hcill state: %ld (losing packet)", ll->hcill_state);
+ kfree_skb(skb);
+ break;
+ }
+
+ spin_unlock_irqrestore(&ll->hcill_lock, flags);
+
+ return 0;
+}
+
+static inline int ll_check_data_len(struct ll_struct *ll, int len)
+{
+ register int room = skb_tailroom(ll->rx_skb);
+
+ BT_DBG("len %d room %d", len, room);
+
+ if (!len) {
+ hci_recv_frame(ll->rx_skb);
+ } else if (len > room) {
+ BT_ERR("Data length is too large");
+ kfree_skb(ll->rx_skb);
+ } else {
+ ll->rx_state = HCILL_W4_DATA;
+ ll->rx_count = len;
+ return len;
+ }
+
+ ll->rx_state = HCILL_W4_PACKET_TYPE;
+ ll->rx_skb = NULL;
+ ll->rx_count = 0;
+
+ return 0;
+}
+
+/* Recv data */
+static int ll_recv(struct hci_uart *hu, void *data, int count)
+{
+ struct ll_struct *ll = hu->priv;
+ register char *ptr;
+ struct hci_event_hdr *eh;
+ struct hci_acl_hdr *ah;
+ struct hci_sco_hdr *sh;
+ register int len, type, dlen;
+
+ BT_DBG("hu %p count %d rx_state %ld rx_count %ld", hu, count, ll->rx_state, ll->rx_count);
+
+ ptr = data;
+ while (count) {
+ if (ll->rx_count) {
+ len = min_t(unsigned int, ll->rx_count, count);
+ memcpy(skb_put(ll->rx_skb, len), ptr, len);
+ ll->rx_count -= len; count -= len; ptr += len;
+
+ if (ll->rx_count)
+ continue;
+
+ switch (ll->rx_state) {
+ case HCILL_W4_DATA:
+ BT_DBG("Complete data");
+ hci_recv_frame(ll->rx_skb);
+
+ ll->rx_state = HCILL_W4_PACKET_TYPE;
+ ll->rx_skb = NULL;
+ continue;
+
+ case HCILL_W4_EVENT_HDR:
+ eh = (struct hci_event_hdr *) ll->rx_skb->data;
+
+ BT_DBG("Event header: evt 0x%2.2x plen %d", eh->evt, eh->plen);
+
+ ll_check_data_len(ll, eh->plen);
+ continue;
+
+ case HCILL_W4_ACL_HDR:
+ ah = (struct hci_acl_hdr *) ll->rx_skb->data;
+ dlen = __le16_to_cpu(ah->dlen);
+
+ BT_DBG("ACL header: dlen %d", dlen);
+
+ ll_check_data_len(ll, dlen);
+ continue;
+
+ case HCILL_W4_SCO_HDR:
+ sh = (struct hci_sco_hdr *) ll->rx_skb->data;
+
+ BT_DBG("SCO header: dlen %d", sh->dlen);
+
+ ll_check_data_len(ll, sh->dlen);
+ continue;
+ }
+ }
+
+ /* HCILL_W4_PACKET_TYPE */
+ switch (*ptr) {
+ case HCI_EVENT_PKT:
+ BT_DBG("Event packet");
+ ll->rx_state = HCILL_W4_EVENT_HDR;
+ ll->rx_count = HCI_EVENT_HDR_SIZE;
+ type = HCI_EVENT_PKT;
+ break;
+
+ case HCI_ACLDATA_PKT:
+ BT_DBG("ACL packet");
+ ll->rx_state = HCILL_W4_ACL_HDR;
+ ll->rx_count = HCI_ACL_HDR_SIZE;
+ type = HCI_ACLDATA_PKT;
+ break;
+
+ case HCI_SCODATA_PKT:
+ BT_DBG("SCO packet");
+ ll->rx_state = HCILL_W4_SCO_HDR;
+ ll->rx_count = HCI_SCO_HDR_SIZE;
+ type = HCI_SCODATA_PKT;
+ break;
+
+ /* HCILL signals */
+ case HCILL_GO_TO_SLEEP_IND:
+ BT_DBG("HCILL_GO_TO_SLEEP_IND packet");
+ ll_device_want_to_sleep(hu);
+ ptr++; count--;
+ continue;
+
+ case HCILL_GO_TO_SLEEP_ACK:
+ /* shouldn't happen */
+ BT_ERR("received HCILL_GO_TO_SLEEP_ACK (in state %ld)", ll->hcill_state);
+ ptr++; count--;
+ continue;
+
+ case HCILL_WAKE_UP_IND:
+ BT_DBG("HCILL_WAKE_UP_IND packet");
+ ll_device_want_to_wakeup(hu);
+ ptr++; count--;
+ continue;
+
+ case HCILL_WAKE_UP_ACK:
+ BT_DBG("HCILL_WAKE_UP_ACK packet");
+ ll_device_woke_up(hu);
+ ptr++; count--;
+ continue;
+
+ default:
+ BT_ERR("Unknown HCI packet type %2.2x", (__u8)*ptr);
+ hu->hdev->stat.err_rx++;
+ ptr++; count--;
+ continue;
+		}
+
+ ptr++; count--;
+
+ /* Allocate packet */
+ ll->rx_skb = bt_skb_alloc(HCI_MAX_FRAME_SIZE, GFP_ATOMIC);
+ if (!ll->rx_skb) {
+ BT_ERR("Can't allocate mem for new packet");
+ ll->rx_state = HCILL_W4_PACKET_TYPE;
+ ll->rx_count = 0;
+ return 0;
+ }
+
+ ll->rx_skb->dev = (void *) hu->hdev;
+ bt_cb(ll->rx_skb)->pkt_type = type;
+ }
+
+ return count;
+}
+
+static struct sk_buff *ll_dequeue(struct hci_uart *hu)
+{
+ struct ll_struct *ll = hu->priv;
+ return skb_dequeue(&ll->txq);
+}
+
+static struct hci_uart_proto llp = {
+ .id = HCI_UART_LL,
+ .open = ll_open,
+ .close = ll_close,
+ .recv = ll_recv,
+ .enqueue = ll_enqueue,
+ .dequeue = ll_dequeue,
+ .flush = ll_flush,
+};
+
+int ll_init(void)
+{
+ int err = hci_uart_register_proto(&llp);
+
+ if (!err)
+ BT_INFO("HCILL protocol initialized");
+ else
+ BT_ERR("HCILL protocol registration failed");
+
+ return err;
+}
+
+int ll_deinit(void)
+{
+ return hci_uart_unregister_proto(&llp);
+}
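
Editor's note: HCILL layers a four-state power handshake on top of H4, driven
by the single-byte opcodes defined at the top of the file. Worked example: a
TX while HCILL_ASLEEP queues the skb on tx_wait_q, sends HCILL_WAKE_UP_IND and
enters HCILL_ASLEEP_TO_AWAKE; the device's HCILL_WAKE_UP_ACK then moves
tx_wait_q onto txq via __ll_do_awake(). A condensed view of the transitions
handled above (illustrative only, mirrors the ll_device_* handlers):

	static const char *hcill_transition(unsigned long state, u8 opcode)
	{
		switch (opcode) {
		case HCILL_GO_TO_SLEEP_IND:	/* device going to sleep */
			return "AWAKE -> ASLEEP, send GO_TO_SLEEP_ACK";
		case HCILL_WAKE_UP_IND:		/* device wants to wake */
			return state == HCILL_ASLEEP_TO_AWAKE ?
				"ASLEEP_TO_AWAKE -> AWAKE, pending IND acts as ACK" :
				"ASLEEP -> AWAKE, send WAKE_UP_ACK, flush tx_wait_q";
		case HCILL_WAKE_UP_ACK:		/* device woke up for us */
			return "ASLEEP_TO_AWAKE -> AWAKE, flush tx_wait_q";
		default:
			return "not a power-management opcode";
		}
	}
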
diff --git a/drivers/bluetooth/hci_uart.h b/drivers/bluetooth/hci_uart.h
index 1097ce72393..50113db06b9 100644
--- a/drivers/bluetooth/hci_uart.h
+++ b/drivers/bluetooth/hci_uart.h
@@ -33,12 +33,13 @@
#define HCIUARTGETDEVICE _IOR('U', 202, int)
/* UART protocols */
-#define HCI_UART_MAX_PROTO 4
+#define HCI_UART_MAX_PROTO 5
#define HCI_UART_H4 0
#define HCI_UART_BCSP 1
#define HCI_UART_3WIRE 2
#define HCI_UART_H4DS 3
+#define HCI_UART_LL 4
struct hci_uart;
@@ -85,3 +86,8 @@ int h4_deinit(void);
int bcsp_init(void);
int bcsp_deinit(void);
#endif
+
+#ifdef CONFIG_BT_HCIUART_LL
+int ll_init(void);
+int ll_deinit(void);
+#endif
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 65491103e0f..bf18d757b87 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -613,6 +613,10 @@ config HVC_XEN
help
Xen virtual console device driver
+config VIRTIO_CONSOLE
+ bool
+ select HVC_DRIVER
+
config HVCS
tristate "IBM Hypervisor Virtual Console Server support"
depends on PPC_PSERIES
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index c78ff26647e..07304d50e0c 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -42,7 +42,6 @@ obj-$(CONFIG_SYNCLINK_GT) += synclink_gt.o
obj-$(CONFIG_N_HDLC) += n_hdlc.o
obj-$(CONFIG_AMIGA_BUILTIN_SERIAL) += amiserial.o
obj-$(CONFIG_SX) += sx.o generic_serial.o
-obj-$(CONFIG_LGUEST_GUEST) += hvc_lguest.o
obj-$(CONFIG_RIO) += rio/ generic_serial.o
obj-$(CONFIG_HVC_CONSOLE) += hvc_vio.o hvsi.o
obj-$(CONFIG_HVC_ISERIES) += hvc_iseries.o
@@ -50,6 +49,7 @@ obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o
obj-$(CONFIG_HVC_BEAT) += hvc_beat.o
obj-$(CONFIG_HVC_DRIVER) += hvc_console.o
obj-$(CONFIG_HVC_XEN) += hvc_xen.o
+obj-$(CONFIG_VIRTIO_CONSOLE) += virtio_console.o
obj-$(CONFIG_RAW_DRIVER) += raw.o
obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o
obj-$(CONFIG_MSPEC) += mspec.o
diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index d1bd0f08a33..e4f579c3e24 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -1602,8 +1602,8 @@ static void cyz_handle_tx(struct cyclades_port *info,
info->icount.tx++;
}
#endif
-ztxdone:
tty_wakeup(tty);
+ztxdone:
/* Update tx_put */
cy_writel(&buf_ctrl->tx_put, tx_put);
}
diff --git a/drivers/char/hvc_lguest.c b/drivers/char/hvc_lguest.c
deleted file mode 100644
index efccb215583..00000000000
--- a/drivers/char/hvc_lguest.c
+++ /dev/null
@@ -1,177 +0,0 @@
-/*D:300
- * The Guest console driver
- *
- * This is a trivial console driver: we use lguest's DMA mechanism to send
- * bytes out, and register a DMA buffer to receive bytes in. It is assumed to
- * be present and available from the very beginning of boot.
- *
- * Writing console drivers is one of the few remaining Dark Arts in Linux.
- * Fortunately for us, the path of virtual consoles has been well-trodden by
- * the PowerPC folks, who wrote "hvc_console.c" to generically support any
- * virtual console. We use that infrastructure which only requires us to write
- * the basic put_chars and get_chars functions and call the right register
- * functions.
- :*/
-
-/*M:002 The console can be flooded: while the Guest is processing input the
- * Host can send more. Buffering in the Host could alleviate this, but it is a
- * difficult problem in general. :*/
-/* Copyright (C) 2006 Rusty Russell, IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/lguest_bus.h>
-#include <asm/paravirt.h>
-#include "hvc_console.h"
-
-/*D:340 This is our single console input buffer, with associated "struct
- * lguest_dma" referring to it. Note the 0-terminated length array, and the
- * use of physical address for the buffer itself. */
-static char inbuf[256];
-static struct lguest_dma cons_input = { .used_len = 0,
- .addr[0] = __pa(inbuf),
- .len[0] = sizeof(inbuf),
- .len[1] = 0 };
-
-/*D:310 The put_chars() callback is pretty straightforward.
- *
- * First we put the pointer and length in a "struct lguest_dma": we only have
- * one pointer, so we set the second length to 0. Then we use SEND_DMA to send
- * the data to (Host) buffers attached to the console key. Usually a device's
- * key is a physical address within the device's memory, but because the
- * console device doesn't have any associated physical memory, we use the
- * LGUEST_CONSOLE_DMA_KEY constant (aka 0). */
-static int put_chars(u32 vtermno, const char *buf, int count)
-{
- struct lguest_dma dma;
-
- /* FIXME: DMA buffers in a "struct lguest_dma" are not allowed
- * to go over page boundaries. This never seems to happen,
- * but if it did we'd need to fix this code. */
- dma.len[0] = count;
- dma.len[1] = 0;
- dma.addr[0] = __pa(buf);
-
- lguest_send_dma(LGUEST_CONSOLE_DMA_KEY, &dma);
- /* We're expected to return the amount of data we wrote: all of it. */
- return count;
-}
-
-/*D:350 get_chars() is the callback from the hvc_console infrastructure when
- * an interrupt is received.
- *
- * Firstly we see if our buffer has been filled: if not, we return. The rest
- * of the code deals with the fact that the hvc_console() infrastructure only
- * asks us for 16 bytes at a time. We keep a "cons_offset" variable for
- * partially-read buffers. */
-static int get_chars(u32 vtermno, char *buf, int count)
-{
- static int cons_offset;
-
- /* Nothing left to see here... */
- if (!cons_input.used_len)
- return 0;
-
- /* You want more than we have to give? Well, try wanting less! */
- if (cons_input.used_len - cons_offset < count)
- count = cons_input.used_len - cons_offset;
-
- /* Copy across to their buffer and increment offset. */
- memcpy(buf, inbuf + cons_offset, count);
- cons_offset += count;
-
- /* Finished? Zero offset, and reset cons_input so Host will use it
- * again. */
- if (cons_offset == cons_input.used_len) {
- cons_offset = 0;
- cons_input.used_len = 0;
- }
- return count;
-}
-/*:*/
-
-static struct hv_ops lguest_cons = {
- .get_chars = get_chars,
- .put_chars = put_chars,
-};
-
-/*D:320 Console drivers are initialized very early so boot messages can go
- * out. At this stage, the console is output-only. Our driver checks we're a
- * Guest, and if so hands hvc_instantiate() the console number (0), priority
- * (0), and the struct hv_ops containing the put_chars() function. */
-static int __init cons_init(void)
-{
- if (strcmp(pv_info.name, "lguest") != 0)
- return 0;
-
- return hvc_instantiate(0, 0, &lguest_cons);
-}
-console_initcall(cons_init);
-
-/*D:370 To set up and manage our virtual console, we call hvc_alloc() and
- * stash the result in the private pointer of the "struct lguest_device".
- * Since we never remove the console device we never need this pointer again,
- * but using ->private is considered good form, and you never know who's going
- * to copy your driver.
- *
- * Once the console is set up, we bind our input buffer ready for input. */
-static int lguestcons_probe(struct lguest_device *lgdev)
-{
- int err;
-
- /* The first argument of hvc_alloc() is the virtual console number, so
- * we use zero. The second argument is the interrupt number.
- *
- * The third argument is a "struct hv_ops" containing the put_chars()
- * and get_chars() pointers. The final argument is the output buffer
- * size: we use 256 and expect the Host to have room for us to send
- * that much. */
- lgdev->private = hvc_alloc(0, lgdev_irq(lgdev), &lguest_cons, 256);
- if (IS_ERR(lgdev->private))
- return PTR_ERR(lgdev->private);
-
- /* We bind a single DMA buffer at key LGUEST_CONSOLE_DMA_KEY.
- * "cons_input" is that statically-initialized global DMA buffer we saw
- * above, and we also give the interrupt we want. */
- err = lguest_bind_dma(LGUEST_CONSOLE_DMA_KEY, &cons_input, 1,
- lgdev_irq(lgdev));
- if (err)
- printk("lguest console: failed to bind buffer.\n");
- return err;
-}
-/* Note the use of lgdev_irq() for the interrupt number. We tell hvc_alloc()
- * to expect input when this interrupt is triggered, and then tell
- * lguest_bind_dma() that is the interrupt to send us when input comes in. */
-
-/*D:360 From now on the console driver follows standard Guest driver form:
- * register_lguest_driver() registers the device type and probe function, and
- * the probe function sets up the device.
- *
- * The standard "struct lguest_driver": */
-static struct lguest_driver lguestcons_drv = {
- .name = "lguestcons",
- .owner = THIS_MODULE,
- .device_type = LGUEST_DEVICE_T_CONSOLE,
- .probe = lguestcons_probe,
-};
-
-/* The standard init function */
-static int __init hvc_lguest_init(void)
-{
- return register_lguest_driver(&lguestcons_drv);
-}
-module_init(hvc_lguest_init);
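For context on what the removed driver's put_chars() was doing: the old lguest transport described a buffer as a fixed array of (addr, len) segments, terminated by a zero length. A minimal userspace sketch of that descriptor fill, with an assumed layout (the real struct lguest_dma differs in field types and section count):

    #include <stdint.h>
    #include <string.h>

    #define MAX_SECTIONS 8               /* assumed LGUEST_MAX_DMA_SECTIONS */

    struct lguest_dma_model {
        uint16_t len[MAX_SECTIONS];      /* zero-terminated length array */
        uint64_t addr[MAX_SECTIONS];     /* physical addresses */
    };

    /* Mirrors the removed put_chars(): one segment, next length zeroed
     * to terminate the list; a cast stands in for __pa(). */
    static void fill_single(struct lguest_dma_model *dma,
                            const void *buf, uint16_t count)
    {
        memset(dma, 0, sizeof(*dma));
        dma->len[0] = count;
        dma->len[1] = 0;                 /* explicit terminator, as above */
        dma->addr[0] = (uintptr_t)buf;
    }

    int main(void)
    {
        struct lguest_dma_model dma;
        static const char msg[] = "hello";

        fill_single(&dma, msg, sizeof(msg) - 1);
        return dma.len[0] == 5 ? 0 : 1;
    }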
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
new file mode 100644
index 00000000000..100e8a201e3
--- /dev/null
+++ b/drivers/char/virtio_console.c
@@ -0,0 +1,225 @@
+/*D:300
+ * The Guest console driver
+ *
+ * Writing console drivers is one of the few remaining Dark Arts in Linux.
+ * Fortunately for us, the path of virtual consoles has been well-trodden by
+ * the PowerPC folks, who wrote "hvc_console.c" to generically support any
+ * virtual console. We use that infrastructure, which requires us only to
+ * write the basic put_chars and get_chars functions and to call the right
+ * registration functions.
+ :*/
+
+/*M:002 The console can be flooded: while the Guest is processing input the
+ * Host can send more. Buffering in the Host could alleviate this, but it is a
+ * difficult problem in general. :*/
+/* Copyright (C) 2006, 2007 Rusty Russell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/virtio.h>
+#include <linux/virtio_console.h>
+#include "hvc_console.h"
+
+/*D:340 These represent our input and output console queues, and the virtio
+ * operations for them. */
+static struct virtqueue *in_vq, *out_vq;
+static struct virtio_device *vdev;
+
+/* This is our input buffer (inbuf), our read cursor into it (in), and how
+ * much data is left to hand to hvc (in_len). */
+static unsigned int in_len;
+static char *in, *inbuf;
+
+/* The operations for our console. */
+static struct hv_ops virtio_cons;
+
+/*D:310 The put_chars() callback is pretty straightforward.
+ *
+ * We turn the characters into a scatter-gather list, add it to the output
+ * queue and then kick the Host. Then we sit here waiting for it to finish:
+ * inefficient in theory, but in practice implementations will do it
+ * immediately (lguest's Launcher does). */
+static int put_chars(u32 vtermno, const char *buf, int count)
+{
+ struct scatterlist sg[1];
+ unsigned int len;
+
+ /* This is a convenient routine to initialize a single-element sg list */
+ sg_init_one(sg, buf, count);
+
+ /* add_buf wants a token to identify this buffer: we hand it any
+ * non-NULL pointer, since there's only ever one buffer. */
+ if (out_vq->vq_ops->add_buf(out_vq, sg, 1, 0, (void *)1) == 0) {
+ /* Tell Host to go! */
+ out_vq->vq_ops->kick(out_vq);
+ /* Chill out until it's done with the buffer. */
+ while (!out_vq->vq_ops->get_buf(out_vq, &len))
+ cpu_relax();
+ }
+
+ /* We're expected to return the amount of data we wrote: all of it. */
+ return count;
+}
+
+/* Create a scatter-gather list representing our input buffer and put it in the
+ * queue. */
+static void add_inbuf(void)
+{
+ struct scatterlist sg[1];
+ sg_init_one(sg, inbuf, PAGE_SIZE);
+
+ /* We should always be able to add one buffer to an empty queue. */
+ if (in_vq->vq_ops->add_buf(in_vq, sg, 0, 1, inbuf) != 0)
+ BUG();
+ in_vq->vq_ops->kick(in_vq);
+}
+
+/*D:350 get_chars() is the callback from the hvc_console infrastructure when
+ * an interrupt is received.
+ *
+ * Most of the code deals with the fact that the hvc_console() infrastructure
+ * only asks us for 16 bytes at a time. We keep the "in" pointer and the
+ * "in_len" count for partially-consumed buffers. */
+static int get_chars(u32 vtermno, char *buf, int count)
+{
+ /* If we don't have an input queue yet, we can't get input. */
+ BUG_ON(!in_vq);
+
+ /* No buffer? Try to get one. */
+ if (!in_len) {
+ in = in_vq->vq_ops->get_buf(in_vq, &in_len);
+ if (!in)
+ return 0;
+ }
+
+ /* You want more than we have to give? Well, try wanting less! */
+ if (in_len < count)
+ count = in_len;
+
+ /* Copy across to their buffer and advance our read pointer. */
+ memcpy(buf, in, count);
+ in += count;
+ in_len -= count;
+
+ /* Finished? Re-register buffer so Host will use it again. */
+ if (in_len == 0)
+ add_inbuf();
+
+ return count;
+}
+/*:*/
+
+/*D:320 Console drivers are initialized very early so boot messages can go out,
+ * so we do things slightly differently from the generic virtio initialization
+ * of the net and block drivers.
+ *
+ * At this stage, the console is output-only. It's too early to set up a
+ * virtqueue, so we let the drivers do some boutique early-output thing. */
+int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int))
+{
+ virtio_cons.put_chars = put_chars;
+ return hvc_instantiate(0, 0, &virtio_cons);
+}
+
+/*D:370 Once we're further in boot, we get probed like any other virtio device.
+ * At this stage we set up the output virtqueue.
+ *
+ * To set up and manage our virtual console, we call hvc_alloc(). Since we
+ * never remove the console device we never need this pointer again.
+ *
+ * Finally we put our input buffer in the input queue, ready to receive. */
+static int virtcons_probe(struct virtio_device *dev)
+{
+ int err;
+ struct hvc_struct *hvc;
+
+ vdev = dev;
+
+ /* This is the scratch page we use to receive console input */
+ inbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!inbuf) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ /* Find the input queue. */
+ /* FIXME: This is why we want to wean off hvc: we do nothing
+ * when input comes in. */
+ in_vq = vdev->config->find_vq(vdev, NULL);
+ if (IS_ERR(in_vq)) {
+ err = PTR_ERR(in_vq);
+ goto free;
+ }
+
+ out_vq = vdev->config->find_vq(vdev, NULL);
+ if (IS_ERR(out_vq)) {
+ err = PTR_ERR(out_vq);
+ goto free_in_vq;
+ }
+
+ /* Start using the new console output. */
+ virtio_cons.get_chars = get_chars;
+ virtio_cons.put_chars = put_chars;
+
+ /* The first argument of hvc_alloc() is the virtual console number, so
+ * we use zero. The second argument is the interrupt number; we
+ * currently leave this as zero: it would be better not to use the
+ * hvc mechanism and fix this (FIXME!).
+ *
+ * The third argument is a "struct hv_ops" containing the put_chars()
+ * and get_chars() pointers. The final argument is the output buffer
+ * size: we can do any size, so we put PAGE_SIZE here. */
+ hvc = hvc_alloc(0, 0, &virtio_cons, PAGE_SIZE);
+ if (IS_ERR(hvc)) {
+ err = PTR_ERR(hvc);
+ goto free_out_vq;
+ }
+
+ /* Register the input buffer the first time. */
+ add_inbuf();
+ return 0;
+
+free_out_vq:
+ vdev->config->del_vq(out_vq);
+free_in_vq:
+ vdev->config->del_vq(in_vq);
+free:
+ kfree(inbuf);
+fail:
+ return err;
+}
+
+static struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_CONSOLE, VIRTIO_DEV_ANY_ID },
+ { 0 },
+};
+
+static struct virtio_driver virtio_console = {
+ .driver.name = KBUILD_MODNAME,
+ .driver.owner = THIS_MODULE,
+ .id_table = id_table,
+ .probe = virtcons_probe,
+};
+
+static int __init init(void)
+{
+ return register_virtio_driver(&virtio_console);
+}
+module_init(init);
+
+MODULE_DEVICE_TABLE(virtio, id_table);
+MODULE_DESCRIPTION("Virtio console driver");
+MODULE_LICENSE("GPL");
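Since get_chars() above is the subtlest part of the new driver -- hvc_console only asks for 16 bytes at a time, so one received buffer is handed back over several calls -- here is a minimal userspace sketch of that bookkeeping. Names are hypothetical: chunk_read stands in for get_chars(), and the statics model the driver's in/in_len.

    #include <stdio.h>
    #include <string.h>

    static const char *in;       /* models the driver's "in" pointer    */
    static size_t in_len;        /* models the driver's "in_len" count  */

    static size_t chunk_read(char *buf, size_t count)
    {
        if (in_len < count)      /* want more than we have? want less!  */
            count = in_len;
        memcpy(buf, in, count);
        in += count;             /* advance the read cursor             */
        in_len -= count;
        return count;
    }

    int main(void)
    {
        static const char msg[] = "a longer message is drained in 16-byte calls";
        char buf[16];
        size_t n;

        in = msg;
        in_len = sizeof(msg) - 1;
        while ((n = chunk_read(buf, sizeof(buf))) > 0)
            printf("got %zu bytes: %.*s\n", n, (int)n, buf);
        return 0;
    }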
diff --git a/drivers/firewire/fw-ohci.c b/drivers/firewire/fw-ohci.c
index 2f307c4df33..67588326ae5 100644
--- a/drivers/firewire/fw-ohci.c
+++ b/drivers/firewire/fw-ohci.c
@@ -606,7 +606,7 @@ static int
at_context_queue_packet(struct context *ctx, struct fw_packet *packet)
{
struct fw_ohci *ohci = ctx->ohci;
- dma_addr_t d_bus, payload_bus;
+ dma_addr_t d_bus, uninitialized_var(payload_bus);
struct driver_data *driver_data;
struct descriptor *d, *last;
__le32 *header;
@@ -1459,7 +1459,7 @@ ohci_allocate_iso_context(struct fw_card *card, int type, size_t header_size)
/* FIXME: We need a fallback for pre 1.1 OHCI. */
if (callback == handle_ir_dualbuffer_packet &&
ohci->version < OHCI_VERSION_1_1)
- return ERR_PTR(-EINVAL);
+ return ERR_PTR(-ENOSYS);
spin_lock_irqsave(&ohci->lock, flags);
index = ffs(*mask) - 1;
@@ -1778,7 +1778,7 @@ ohci_queue_iso(struct fw_iso_context *base,
buffer, payload);
else
/* FIXME: Implement fallback for OHCI 1.0 controllers. */
- return -EINVAL;
+ return -ENOSYS;
}
static const struct fw_card_driver ohci_driver = {
@@ -1898,7 +1898,12 @@ pci_probe(struct pci_dev *dev, const struct pci_device_id *ent)
ohci->version = reg_read(ohci, OHCI1394_Version) & 0x00ff00ff;
fw_notify("Added fw-ohci device %s, OHCI version %x.%x\n",
dev->dev.bus_id, ohci->version >> 16, ohci->version & 0xff);
-
+ if (ohci->version < OHCI_VERSION_1_1) {
+ fw_notify(" Isochronous I/O is not yet implemented for "
+ "OHCI 1.0 chips.\n");
+ fw_notify(" Cameras, audio devices etc. won't work on "
+ "this controller with this driver version.\n");
+ }
return 0;
fail_self_id:
diff --git a/drivers/ide/cris/ide-cris.c b/drivers/ide/cris/ide-cris.c
index ff20377b4c8..e196aefa207 100644
--- a/drivers/ide/cris/ide-cris.c
+++ b/drivers/ide/cris/ide-cris.c
@@ -935,11 +935,11 @@ static int cris_ide_build_dmatable (ide_drive_t *drive)
* than two possibly non-adjacent physical 4kB pages.
*/
/* group sequential buffers into one large buffer */
- addr = page_to_phys(sg->page) + sg->offset;
+ addr = sg_phys(sg);
size = sg_dma_len(sg);
while (--i) {
sg = sg_next(sg);
- if ((addr + size) != page_to_phys(sg->page) + sg->offset)
+ if ((addr + size) != sg_phys(sg))
break;
size += sg_dma_len(sg);
}
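The scatterlist conversions in this and the following hunks are mechanical: in this era's linux/scatterlist.h, sg_phys(sg) and sg_virt(sg) are (roughly) page_to_phys(sg_page(sg)) + sg->offset and page_address(sg_page(sg)) + sg->offset, i.e. exactly the open-coded expressions being replaced. A minimal userspace model with simplified stand-in types:

    #include <stdint.h>
    #include <stdio.h>

    /* Simplified stand-ins for the kernel's types; illustrative only. */
    struct page { uintptr_t pfn; };
    struct scatterlist { struct page *page; unsigned int offset; };

    #define PAGE_SHIFT 12

    static uintptr_t page_to_phys(struct page *p) { return p->pfn << PAGE_SHIFT; }
    static struct page *sg_page(struct scatterlist *sg) { return sg->page; }

    /* sg_phys() is just the expression these hunks replace;
     * sg_virt() is the analogous page_address()-based form. */
    static uintptr_t sg_phys(struct scatterlist *sg)
    {
        return page_to_phys(sg_page(sg)) + sg->offset;
    }

    int main(void)
    {
        struct page pg = { .pfn = 0x1234 };
        struct scatterlist sg = { .page = &pg, .offset = 0x80 };

        printf("phys=%#lx\n", (unsigned long)sg_phys(&sg));
        return 0;
    }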
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index d5146c57e5b..6a6f2e066b4 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -47,6 +47,7 @@
#include <linux/spinlock.h>
#include <linux/kmod.h>
#include <linux/pci.h>
+#include <linux/scatterlist.h>
#include <asm/byteorder.h>
#include <asm/irq.h>
@@ -1317,12 +1318,14 @@ static int hwif_init(ide_hwif_t *hwif)
if (!hwif->sg_max_nents)
hwif->sg_max_nents = PRD_ENTRIES;
- hwif->sg_table = kzalloc(sizeof(struct scatterlist)*hwif->sg_max_nents,
+ hwif->sg_table = kmalloc(sizeof(struct scatterlist)*hwif->sg_max_nents,
GFP_KERNEL);
if (!hwif->sg_table) {
printk(KERN_ERR "%s: unable to allocate SG table.\n", hwif->name);
goto out;
}
+
+ sg_init_table(hwif->sg_table, hwif->sg_max_nents);
if (init_irq(hwif) == 0)
goto done;
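The kzalloc-to-kmalloc change here only makes sense together with the new sg_init_table() call: with chained scatterlists, a table needs an explicit end marker in its last entry, so a plain zeroed allocation is no longer a valid table. A rough userspace model, with an assumed flag layout:

    #include <stdlib.h>
    #include <string.h>

    #define SG_END 0x2UL   /* illustrative flag; the real marker lives in page_link */

    /* Simplified stand-in for the kernel's struct scatterlist. */
    struct scatterlist { unsigned long page_link; unsigned int offset, length; };

    static void sg_init_table(struct scatterlist *sgl, unsigned int nents)
    {
        memset(sgl, 0, sizeof(*sgl) * nents);
        sgl[nents - 1].page_link |= SG_END;   /* terminate the list */
    }

    int main(void)
    {
        unsigned int nents = 8;
        struct scatterlist *tbl = malloc(sizeof(*tbl) * nents);

        if (!tbl)
            return 1;
        sg_init_table(tbl, nents);   /* zeroing alone would miss the marker */
        free(tbl);
        return 0;
    }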
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index 73ef6bf5fbc..d066546f283 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -261,7 +261,7 @@ static void ide_pio_sector(ide_drive_t *drive, unsigned int write)
hwif->cursg = sg;
}
- page = cursg->page;
+ page = sg_page(cursg);
offset = cursg->offset + hwif->cursg_ofs * SECTOR_SIZE;
/* get the current page and offset */
diff --git a/drivers/ide/mips/au1xxx-ide.c b/drivers/ide/mips/au1xxx-ide.c
index 1de58566e5b..a4ce3ba15d6 100644
--- a/drivers/ide/mips/au1xxx-ide.c
+++ b/drivers/ide/mips/au1xxx-ide.c
@@ -276,8 +276,7 @@ static int auide_build_dmatable(ide_drive_t *drive)
if (iswrite) {
if(!put_source_flags(ahwif->tx_chan,
- (void*)(page_address(sg->page)
- + sg->offset),
+ (void*) sg_virt(sg),
tc, flags)) {
printk(KERN_ERR "%s failed %d\n",
__FUNCTION__, __LINE__);
@@ -285,8 +284,7 @@ static int auide_build_dmatable(ide_drive_t *drive)
} else
{
if(!put_dest_flags(ahwif->rx_chan,
- (void*)(page_address(sg->page)
- + sg->offset),
+ (void*) sg_virt(sg),
tc, flags)) {
printk(KERN_ERR "%s failed %d\n",
__FUNCTION__, __LINE__);
diff --git a/drivers/ieee1394/dma.c b/drivers/ieee1394/dma.c
index 45d60558192..3051e312fdc 100644
--- a/drivers/ieee1394/dma.c
+++ b/drivers/ieee1394/dma.c
@@ -12,7 +12,7 @@
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
#include "dma.h"
@@ -111,7 +111,7 @@ int dma_region_alloc(struct dma_region *dma, unsigned long n_bytes,
unsigned long va =
(unsigned long)dma->kvirt + (i << PAGE_SHIFT);
- dma->sglist[i].page = vmalloc_to_page((void *)va);
+ sg_set_page(&dma->sglist[i], vmalloc_to_page((void *)va));
dma->sglist[i].length = PAGE_SIZE;
}
diff --git a/drivers/ieee1394/sbp2.c b/drivers/ieee1394/sbp2.c
index 1b353b964b3..d5dfe11aa5c 100644
--- a/drivers/ieee1394/sbp2.c
+++ b/drivers/ieee1394/sbp2.c
@@ -1466,7 +1466,7 @@ static void sbp2_prep_command_orb_sg(struct sbp2_command_orb *orb,
cmd->dma_size = sgpnt[0].length;
cmd->dma_type = CMD_DMA_PAGE;
cmd->cmd_dma = dma_map_page(hi->host->device.parent,
- sgpnt[0].page, sgpnt[0].offset,
+ sg_page(&sgpnt[0]), sgpnt[0].offset,
cmd->dma_size, cmd->dma_dir);
orb->data_descriptor_lo = cmd->cmd_dma;
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index d08fb30768b..0751697ef98 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -114,13 +114,16 @@ struct rdma_id_private {
struct rdma_bind_list *bind_list;
struct hlist_node node;
- struct list_head list;
- struct list_head listen_list;
+ struct list_head list; /* listen_any_list or cma_device.list */
+ struct list_head listen_list; /* per device listens */
struct cma_device *cma_dev;
struct list_head mc_list;
+ int internal_id;
enum cma_state state;
spinlock_t lock;
+ struct mutex qp_mutex;
+
struct completion comp;
atomic_t refcount;
wait_queue_head_t wait_remove;
@@ -389,6 +392,7 @@ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
id_priv->id.event_handler = event_handler;
id_priv->id.ps = ps;
spin_lock_init(&id_priv->lock);
+ mutex_init(&id_priv->qp_mutex);
init_completion(&id_priv->comp);
atomic_set(&id_priv->refcount, 1);
init_waitqueue_head(&id_priv->wait_remove);
@@ -474,61 +478,86 @@ EXPORT_SYMBOL(rdma_create_qp);
void rdma_destroy_qp(struct rdma_cm_id *id)
{
- ib_destroy_qp(id->qp);
+ struct rdma_id_private *id_priv;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ mutex_lock(&id_priv->qp_mutex);
+ ib_destroy_qp(id_priv->id.qp);
+ id_priv->id.qp = NULL;
+ mutex_unlock(&id_priv->qp_mutex);
}
EXPORT_SYMBOL(rdma_destroy_qp);
-static int cma_modify_qp_rtr(struct rdma_cm_id *id)
+static int cma_modify_qp_rtr(struct rdma_id_private *id_priv)
{
struct ib_qp_attr qp_attr;
int qp_attr_mask, ret;
- if (!id->qp)
- return 0;
+ mutex_lock(&id_priv->qp_mutex);
+ if (!id_priv->id.qp) {
+ ret = 0;
+ goto out;
+ }
/* Need to update QP attributes from default values. */
qp_attr.qp_state = IB_QPS_INIT;
- ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
if (ret)
- return ret;
+ goto out;
- ret = ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
if (ret)
- return ret;
+ goto out;
qp_attr.qp_state = IB_QPS_RTR;
- ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
if (ret)
- return ret;
+ goto out;
- return ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+out:
+ mutex_unlock(&id_priv->qp_mutex);
+ return ret;
}
-static int cma_modify_qp_rts(struct rdma_cm_id *id)
+static int cma_modify_qp_rts(struct rdma_id_private *id_priv)
{
struct ib_qp_attr qp_attr;
int qp_attr_mask, ret;
- if (!id->qp)
- return 0;
+ mutex_lock(&id_priv->qp_mutex);
+ if (!id_priv->id.qp) {
+ ret = 0;
+ goto out;
+ }
qp_attr.qp_state = IB_QPS_RTS;
- ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
if (ret)
- return ret;
+ goto out;
- return ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+out:
+ mutex_unlock(&id_priv->qp_mutex);
+ return ret;
}
-static int cma_modify_qp_err(struct rdma_cm_id *id)
+static int cma_modify_qp_err(struct rdma_id_private *id_priv)
{
struct ib_qp_attr qp_attr;
+ int ret;
- if (!id->qp)
- return 0;
+ mutex_lock(&id_priv->qp_mutex);
+ if (!id_priv->id.qp) {
+ ret = 0;
+ goto out;
+ }
qp_attr.qp_state = IB_QPS_ERR;
- return ib_modify_qp(id->qp, &qp_attr, IB_QP_STATE);
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE);
+out:
+ mutex_unlock(&id_priv->qp_mutex);
+ return ret;
}
static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
@@ -717,50 +746,27 @@ static void cma_cancel_route(struct rdma_id_private *id_priv)
}
}
-static inline int cma_internal_listen(struct rdma_id_private *id_priv)
-{
- return (id_priv->state == CMA_LISTEN) && id_priv->cma_dev &&
- cma_any_addr(&id_priv->id.route.addr.src_addr);
-}
-
-static void cma_destroy_listen(struct rdma_id_private *id_priv)
-{
- cma_exch(id_priv, CMA_DESTROYING);
-
- if (id_priv->cma_dev) {
- switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
- case RDMA_TRANSPORT_IB:
- if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib))
- ib_destroy_cm_id(id_priv->cm_id.ib);
- break;
- case RDMA_TRANSPORT_IWARP:
- if (id_priv->cm_id.iw && !IS_ERR(id_priv->cm_id.iw))
- iw_destroy_cm_id(id_priv->cm_id.iw);
- break;
- default:
- break;
- }
- cma_detach_from_dev(id_priv);
- }
- list_del(&id_priv->listen_list);
-
- cma_deref_id(id_priv);
- wait_for_completion(&id_priv->comp);
-
- kfree(id_priv);
-}
-
static void cma_cancel_listens(struct rdma_id_private *id_priv)
{
struct rdma_id_private *dev_id_priv;
+ /*
+ * Remove from listen_any_list to prevent added devices from spawning
+ * additional listen requests.
+ */
mutex_lock(&lock);
list_del(&id_priv->list);
while (!list_empty(&id_priv->listen_list)) {
dev_id_priv = list_entry(id_priv->listen_list.next,
struct rdma_id_private, listen_list);
- cma_destroy_listen(dev_id_priv);
+ /* sync with device removal to avoid duplicate destruction */
+ list_del_init(&dev_id_priv->list);
+ list_del(&dev_id_priv->listen_list);
+ mutex_unlock(&lock);
+
+ rdma_destroy_id(&dev_id_priv->id);
+ mutex_lock(&lock);
}
mutex_unlock(&lock);
}
@@ -848,6 +854,9 @@ void rdma_destroy_id(struct rdma_cm_id *id)
cma_deref_id(id_priv);
wait_for_completion(&id_priv->comp);
+ if (id_priv->internal_id)
+ cma_deref_id(id_priv->id.context);
+
kfree(id_priv->id.route.path_rec);
kfree(id_priv);
}
@@ -857,11 +866,11 @@ static int cma_rep_recv(struct rdma_id_private *id_priv)
{
int ret;
- ret = cma_modify_qp_rtr(&id_priv->id);
+ ret = cma_modify_qp_rtr(id_priv);
if (ret)
goto reject;
- ret = cma_modify_qp_rts(&id_priv->id);
+ ret = cma_modify_qp_rts(id_priv);
if (ret)
goto reject;
@@ -871,7 +880,7 @@ static int cma_rep_recv(struct rdma_id_private *id_priv)
return 0;
reject:
- cma_modify_qp_err(&id_priv->id);
+ cma_modify_qp_err(id_priv);
ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED,
NULL, 0, NULL, 0);
return ret;
@@ -947,7 +956,7 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
/* ignore event */
goto out;
case IB_CM_REJ_RECEIVED:
- cma_modify_qp_err(&id_priv->id);
+ cma_modify_qp_err(id_priv);
event.status = ib_event->param.rej_rcvd.reason;
event.event = RDMA_CM_EVENT_REJECTED;
event.param.conn.private_data = ib_event->private_data;
@@ -1404,14 +1413,13 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv,
cma_attach_to_dev(dev_id_priv, cma_dev);
list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list);
+ atomic_inc(&id_priv->refcount);
+ dev_id_priv->internal_id = 1;
ret = rdma_listen(id, id_priv->backlog);
if (ret)
- goto err;
-
- return;
-err:
- cma_destroy_listen(dev_id_priv);
+ printk(KERN_WARNING "RDMA CMA: cma_listen_on_dev, error %d, "
+ "listening on device %s", ret, cma_dev->device->name);
}
static void cma_listen_on_all(struct rdma_id_private *id_priv)
@@ -2264,7 +2272,7 @@ static int cma_connect_iw(struct rdma_id_private *id_priv,
sin = (struct sockaddr_in*) &id_priv->id.route.addr.dst_addr;
cm_id->remote_addr = *sin;
- ret = cma_modify_qp_rtr(&id_priv->id);
+ ret = cma_modify_qp_rtr(id_priv);
if (ret)
goto out;
@@ -2331,7 +2339,7 @@ static int cma_accept_ib(struct rdma_id_private *id_priv,
int qp_attr_mask, ret;
if (id_priv->id.qp) {
- ret = cma_modify_qp_rtr(&id_priv->id);
+ ret = cma_modify_qp_rtr(id_priv);
if (ret)
goto out;
@@ -2370,7 +2378,7 @@ static int cma_accept_iw(struct rdma_id_private *id_priv,
struct iw_cm_conn_param iw_param;
int ret;
- ret = cma_modify_qp_rtr(&id_priv->id);
+ ret = cma_modify_qp_rtr(id_priv);
if (ret)
return ret;
@@ -2442,7 +2450,7 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
return 0;
reject:
- cma_modify_qp_err(id);
+ cma_modify_qp_err(id_priv);
rdma_reject(id, NULL, 0);
return ret;
}
@@ -2512,7 +2520,7 @@ int rdma_disconnect(struct rdma_cm_id *id)
switch (rdma_node_get_transport(id->device->node_type)) {
case RDMA_TRANSPORT_IB:
- ret = cma_modify_qp_err(id);
+ ret = cma_modify_qp_err(id_priv);
if (ret)
goto out;
/* Initiate or respond to a disconnect. */
@@ -2543,9 +2551,11 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
cma_disable_remove(id_priv, CMA_ADDR_RESOLVED))
return 0;
+ mutex_lock(&id_priv->qp_mutex);
if (!status && id_priv->id.qp)
status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid,
multicast->rec.mlid);
+ mutex_unlock(&id_priv->qp_mutex);
memset(&event, 0, sizeof event);
event.status = status;
@@ -2757,16 +2767,12 @@ static void cma_process_remove(struct cma_device *cma_dev)
id_priv = list_entry(cma_dev->id_list.next,
struct rdma_id_private, list);
- if (cma_internal_listen(id_priv)) {
- cma_destroy_listen(id_priv);
- continue;
- }
-
+ list_del(&id_priv->listen_list);
list_del_init(&id_priv->list);
atomic_inc(&id_priv->refcount);
mutex_unlock(&lock);
- ret = cma_remove_id_dev(id_priv);
+ ret = id_priv->internal_id ? 1 : cma_remove_id_dev(id_priv);
cma_deref_id(id_priv);
if (ret)
rdma_destroy_id(&id_priv->id);
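The recurring theme of this cma.c hunk set is the new qp_mutex: every cma_modify_qp_*() helper and rdma_destroy_qp() now test and change id.qp under the same lock, so a state transition can no longer race the QP being destroyed and freed underneath it. A pthreads sketch of the pattern (a model, not the RDMA API itself):

    #include <pthread.h>
    #include <stddef.h>

    /* Simplified stand-in for struct rdma_id_private. */
    struct id_priv {
        pthread_mutex_t qp_mutex;
        void *qp;                /* NULL once destroyed */
    };

    static int modify_qp(struct id_priv *p)
    {
        int ret = 0;

        pthread_mutex_lock(&p->qp_mutex);
        if (!p->qp)              /* destroyed under us? then a no-op */
            goto out;
        /* ...the ib_modify_qp() equivalent would run here... */
    out:
        pthread_mutex_unlock(&p->qp_mutex);
        return ret;
    }

    static void destroy_qp(struct id_priv *p)
    {
        pthread_mutex_lock(&p->qp_mutex);
        /* ...the ib_destroy_qp() equivalent... */
        p->qp = NULL;            /* later modifies see this and bail */
        pthread_mutex_unlock(&p->qp_mutex);
    }

    int main(void)
    {
        struct id_priv p = { PTHREAD_MUTEX_INITIALIZER, &p };

        modify_qp(&p);           /* sees a live QP */
        destroy_qp(&p);          /* clears p.qp under the lock */
        modify_qp(&p);           /* now a guarded no-op, not a crash */
        return 0;
    }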
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 2f54e29dc7a..14159ff2940 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -55,9 +55,11 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
ib_dma_unmap_sg(dev, chunk->page_list,
chunk->nents, DMA_BIDIRECTIONAL);
for (i = 0; i < chunk->nents; ++i) {
+ struct page *page = sg_page(&chunk->page_list[i]);
+
if (umem->writable && dirty)
- set_page_dirty_lock(chunk->page_list[i].page);
- put_page(chunk->page_list[i].page);
+ set_page_dirty_lock(page);
+ put_page(page);
}
kfree(chunk);
@@ -164,11 +166,12 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
}
chunk->nents = min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK);
+ sg_init_table(chunk->page_list, chunk->nents);
for (i = 0; i < chunk->nents; ++i) {
if (vma_list &&
!is_vm_hugetlb_page(vma_list[i + off]))
umem->hugetlb = 0;
- chunk->page_list[i].page = page_list[i + off];
+ sg_set_page(&chunk->page_list[i], page_list[i + off]);
chunk->page_list[i].offset = 0;
chunk->page_list[i].length = PAGE_SIZE;
}
@@ -179,7 +182,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
DMA_BIDIRECTIONAL);
if (chunk->nmap <= 0) {
for (i = 0; i < chunk->nents; ++i)
- put_page(chunk->page_list[i].page);
+ put_page(sg_page(&chunk->page_list[i]));
kfree(chunk);
ret = -ENOMEM;
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 01d70084aeb..495c803fb11 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -147,8 +147,12 @@ static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id,
spin_lock(&ib_uverbs_idr_lock);
uobj = idr_find(idr, id);
- if (uobj)
- kref_get(&uobj->ref);
+ if (uobj) {
+ if (uobj->context == context)
+ kref_get(&uobj->ref);
+ else
+ uobj = NULL;
+ }
spin_unlock(&ib_uverbs_idr_lock);
return uobj;
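The point of this uverbs hunk is isolation rather than refcounting: an idr id can resolve to an object created under another process's ucontext, and the fix treats that case as a failed lookup. A hedged userspace model of the check:

    #include <assert.h>
    #include <stddef.h>

    /* Simplified stand-in for struct ib_uobject. */
    struct uobj { void *context; int refs; };

    static struct uobj *lookup_checked(struct uobj *found, void *caller_ctx)
    {
        if (!found || found->context != caller_ctx)
            return NULL;   /* someone else's object: act as if absent */
        found->refs++;     /* kref_get() in the real code */
        return found;
    }

    int main(void)
    {
        int ctx_a, ctx_b;
        struct uobj obj = { .context = &ctx_a, .refs = 1 };

        assert(lookup_checked(&obj, &ctx_a) == &obj);
        assert(lookup_checked(&obj, &ctx_b) == NULL);
        return 0;
    }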
diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h
index 3f2d68cff76..2d660ae189e 100644
--- a/drivers/infiniband/hw/ehca/ehca_classes.h
+++ b/drivers/infiniband/hw/ehca/ehca_classes.h
@@ -323,7 +323,6 @@ extern int ehca_static_rate;
extern int ehca_port_act_time;
extern int ehca_use_hp_mr;
extern int ehca_scaling_code;
-extern int ehca_mr_largepage;
struct ipzu_queue_resp {
u32 qe_size; /* queue entry size */
diff --git a/drivers/infiniband/hw/ehca/ehca_hca.c b/drivers/infiniband/hw/ehca/ehca_hca.c
index 4aa3ffa6a19..15806d14046 100644
--- a/drivers/infiniband/hw/ehca/ehca_hca.c
+++ b/drivers/infiniband/hw/ehca/ehca_hca.c
@@ -77,6 +77,7 @@ int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props)
}
memset(props, 0, sizeof(struct ib_device_attr));
+ props->page_size_cap = shca->hca_cap_mr_pgsize;
props->fw_ver = rblock->hw_ver;
props->max_mr_size = rblock->max_mr_size;
props->vendor_id = rblock->vendor_id >> 8;
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
index 7a7dab890f6..c6cd38c5321 100644
--- a/drivers/infiniband/hw/ehca/ehca_main.c
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -65,7 +65,7 @@ int ehca_port_act_time = 30;
int ehca_poll_all_eqs = 1;
int ehca_static_rate = -1;
int ehca_scaling_code = 0;
-int ehca_mr_largepage = 0;
+int ehca_mr_largepage = 1;
module_param_named(open_aqp1, ehca_open_aqp1, int, S_IRUGO);
module_param_named(debug_level, ehca_debug_level, int, S_IRUGO);
@@ -260,13 +260,20 @@ static struct cap_descr {
{ HCA_CAP_MINI_QP, "HCA_CAP_MINI_QP" },
};
-int ehca_sense_attributes(struct ehca_shca *shca)
+static int ehca_sense_attributes(struct ehca_shca *shca)
{
int i, ret = 0;
u64 h_ret;
struct hipz_query_hca *rblock;
struct hipz_query_port *port;
+ static const u32 pgsize_map[] = {
+ HCA_CAP_MR_PGSIZE_4K, 0x1000,
+ HCA_CAP_MR_PGSIZE_64K, 0x10000,
+ HCA_CAP_MR_PGSIZE_1M, 0x100000,
+ HCA_CAP_MR_PGSIZE_16M, 0x1000000,
+ };
+
rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
if (!rblock) {
ehca_gen_err("Cannot allocate rblock memory.");
@@ -329,8 +336,15 @@ int ehca_sense_attributes(struct ehca_shca *shca)
if (EHCA_BMASK_GET(hca_cap_descr[i].mask, shca->hca_cap))
ehca_gen_dbg(" %s", hca_cap_descr[i].descr);
- shca->hca_cap_mr_pgsize = rblock->memory_page_size_supported;
+ /* translate supported MR page sizes; always support 4K */
+ shca->hca_cap_mr_pgsize = EHCA_PAGESIZE;
+ if (ehca_mr_largepage) { /* support extra sizes only if enabled */
+ for (i = 0; i < ARRAY_SIZE(pgsize_map); i += 2)
+ if (rblock->memory_page_size_supported & pgsize_map[i])
+ shca->hca_cap_mr_pgsize |= pgsize_map[i + 1];
+ }
+ /* query max MTU from first port -- it's the same for all ports */
port = (struct hipz_query_port *)rblock;
h_ret = hipz_h_query_port(shca->ipz_hca_handle, 1, port);
if (h_ret != H_SUCCESS) {
diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c
index da88738265e..e239bbf54da 100644
--- a/drivers/infiniband/hw/ehca/ehca_mrmw.c
+++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c
@@ -72,24 +72,14 @@ enum ehca_mr_pgsize {
static u32 ehca_encode_hwpage_size(u32 pgsize)
{
- u32 idx = 0;
- pgsize >>= 12;
- /*
- * map mr page size into hw code:
- * 0, 1, 2, 3 for 4K, 64K, 1M, 64M
- */
- while (!(pgsize & 1)) {
- idx++;
- pgsize >>= 4;
- }
- return idx;
+ int log = ilog2(pgsize);
+ WARN_ON(log < 12 || log > 24 || log & 3);
+ return (log - 12) / 4;
}
static u64 ehca_get_max_hwpage_size(struct ehca_shca *shca)
{
- if (shca->hca_cap_mr_pgsize & HCA_CAP_MR_PGSIZE_16M)
- return EHCA_MR_PGSIZE16M;
- return EHCA_MR_PGSIZE4K;
+ return 1UL << ilog2(shca->hca_cap_mr_pgsize);
}
static struct ehca_mr *ehca_mr_new(void)
@@ -259,7 +249,7 @@ struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
pginfo.u.phy.num_phys_buf = num_phys_buf;
pginfo.u.phy.phys_buf_array = phys_buf_array;
pginfo.next_hwpage =
- ((u64)iova_start & ~(hw_pgsize - 1)) / hw_pgsize;
+ ((u64)iova_start & ~PAGE_MASK) / hw_pgsize;
ret = ehca_reg_mr(shca, e_mr, iova_start, size, mr_access_flags,
e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
@@ -296,7 +286,7 @@ struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
container_of(pd->device, struct ehca_shca, ib_device);
struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
struct ehca_mr_pginfo pginfo;
- int ret;
+ int ret, page_shift;
u32 num_kpages;
u32 num_hwpages;
u64 hwpage_size;
@@ -351,19 +341,20 @@ struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
/* determine number of MR pages */
num_kpages = NUM_CHUNKS((virt % PAGE_SIZE) + length, PAGE_SIZE);
/* select proper hw_pgsize */
- if (ehca_mr_largepage &&
- (shca->hca_cap_mr_pgsize & HCA_CAP_MR_PGSIZE_16M)) {
- int page_shift = PAGE_SHIFT;
- if (e_mr->umem->hugetlb) {
- /* determine page_shift, clamp between 4K and 16M */
- page_shift = (fls64(length - 1) + 3) & ~3;
- page_shift = min(max(page_shift, EHCA_MR_PGSHIFT4K),
- EHCA_MR_PGSHIFT16M);
- }
- hwpage_size = 1UL << page_shift;
- } else
- hwpage_size = EHCA_MR_PGSIZE4K; /* ehca1 only supports 4k */
- ehca_dbg(pd->device, "hwpage_size=%lx", hwpage_size);
+ page_shift = PAGE_SHIFT;
+ if (e_mr->umem->hugetlb) {
+ /* determine page_shift, clamp between 4K and 16M */
+ page_shift = (fls64(length - 1) + 3) & ~3;
+ page_shift = min(max(page_shift, EHCA_MR_PGSHIFT4K),
+ EHCA_MR_PGSHIFT16M);
+ }
+ hwpage_size = 1UL << page_shift;
+
+ /* now that we have the desired page size, shift until it's
+ * supported, too. 4K is always supported, so this terminates.
+ */
+ while (!(hwpage_size & shca->hca_cap_mr_pgsize))
+ hwpage_size >>= 4;
reg_user_mr_fallback:
num_hwpages = NUM_CHUNKS((virt % hwpage_size) + length, hwpage_size);
@@ -547,7 +538,7 @@ int ehca_rereg_phys_mr(struct ib_mr *mr,
pginfo.u.phy.num_phys_buf = num_phys_buf;
pginfo.u.phy.phys_buf_array = phys_buf_array;
pginfo.next_hwpage =
- ((u64)iova_start & ~(hw_pgsize - 1)) / hw_pgsize;
+ ((u64)iova_start & ~PAGE_MASK) / hw_pgsize;
}
if (mr_rereg_mask & IB_MR_REREG_ACCESS)
new_acl = mr_access_flags;
@@ -809,8 +800,9 @@ struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
ib_fmr = ERR_PTR(-EINVAL);
goto alloc_fmr_exit0;
}
- hw_pgsize = ehca_get_max_hwpage_size(shca);
- if ((1 << fmr_attr->page_shift) != hw_pgsize) {
+
+ hw_pgsize = 1 << fmr_attr->page_shift;
+ if (!(hw_pgsize & shca->hca_cap_mr_pgsize)) {
ehca_err(pd->device, "unsupported fmr_attr->page_shift=%x",
fmr_attr->page_shift);
ib_fmr = ERR_PTR(-EINVAL);
@@ -826,6 +818,7 @@ struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
/* register MR on HCA */
memset(&pginfo, 0, sizeof(pginfo));
+ pginfo.hwpage_size = hw_pgsize;
/*
* pginfo.num_hwpages==0, ie register_rpages() will not be called
* but deferred to map_phys_fmr()
@@ -1776,7 +1769,7 @@ static int ehca_set_pagebuf_user1(struct ehca_mr_pginfo *pginfo,
list_for_each_entry_continue(
chunk, (&(pginfo->u.usr.region->chunk_list)), list) {
for (i = pginfo->u.usr.next_nmap; i < chunk->nmap; ) {
- pgaddr = page_to_pfn(chunk->page_list[i].page)
+ pgaddr = page_to_pfn(sg_page(&chunk->page_list[i]))
<< PAGE_SHIFT ;
*kpage = phys_to_abs(pgaddr +
(pginfo->next_hwpage *
@@ -1832,7 +1825,7 @@ static int ehca_check_kpages_per_ate(struct scatterlist *page_list,
{
int t;
for (t = start_idx; t <= end_idx; t++) {
- u64 pgaddr = page_to_pfn(page_list[t].page) << PAGE_SHIFT;
+ u64 pgaddr = page_to_pfn(sg_page(&page_list[t])) << PAGE_SHIFT;
ehca_gen_dbg("chunk_page=%lx value=%016lx", pgaddr,
*(u64 *)abs_to_virt(phys_to_abs(pgaddr)));
if (pgaddr - PAGE_SIZE != *prev_pgaddr) {
@@ -1867,7 +1860,7 @@ static int ehca_set_pagebuf_user2(struct ehca_mr_pginfo *pginfo,
chunk, (&(pginfo->u.usr.region->chunk_list)), list) {
for (i = pginfo->u.usr.next_nmap; i < chunk->nmap; ) {
if (nr_kpages == kpages_per_hwpage) {
- pgaddr = ( page_to_pfn(chunk->page_list[i].page)
+ pgaddr = ( page_to_pfn(sg_page(&chunk->page_list[i]))
<< PAGE_SHIFT );
*kpage = phys_to_abs(pgaddr);
if ( !(*kpage) ) {
diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c
index e2bd62be11e..de182648b28 100644
--- a/drivers/infiniband/hw/ehca/ehca_qp.c
+++ b/drivers/infiniband/hw/ehca/ehca_qp.c
@@ -451,7 +451,6 @@ static struct ehca_qp *internal_create_qp(
has_srq = 1;
parms.ext_type = EQPT_SRQBASE;
parms.srq_qpn = my_srq->real_qp_num;
- parms.srq_token = my_srq->token;
}
if (is_llqp && has_srq) {
@@ -583,6 +582,9 @@ static struct ehca_qp *internal_create_qp(
goto create_qp_exit1;
}
+ if (has_srq)
+ parms.srq_token = my_qp->token;
+
parms.servicetype = ibqptype2servicetype(qp_type);
if (parms.servicetype < 0) {
ret = -EINVAL;
diff --git a/drivers/infiniband/hw/ipath/ipath_dma.c b/drivers/infiniband/hw/ipath/ipath_dma.c
index 22709a4f8fc..e90a0ea538a 100644
--- a/drivers/infiniband/hw/ipath/ipath_dma.c
+++ b/drivers/infiniband/hw/ipath/ipath_dma.c
@@ -108,7 +108,7 @@ static int ipath_map_sg(struct ib_device *dev, struct scatterlist *sgl,
BUG_ON(!valid_dma_direction(direction));
for_each_sg(sgl, sg, nents, i) {
- addr = (u64) page_address(sg->page);
+ addr = (u64) page_address(sg_page(sg));
/* TODO: handle highmem pages */
if (!addr) {
ret = 0;
@@ -127,7 +127,7 @@ static void ipath_unmap_sg(struct ib_device *dev,
static u64 ipath_sg_dma_address(struct ib_device *dev, struct scatterlist *sg)
{
- u64 addr = (u64) page_address(sg->page);
+ u64 addr = (u64) page_address(sg_page(sg));
if (addr)
addr += sg->offset;
diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c
index e442470a237..db4ba92f79f 100644
--- a/drivers/infiniband/hw/ipath/ipath_mr.c
+++ b/drivers/infiniband/hw/ipath/ipath_mr.c
@@ -225,7 +225,7 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
for (i = 0; i < chunk->nents; i++) {
void *vaddr;
- vaddr = page_address(chunk->page_list[i].page);
+ vaddr = page_address(sg_page(&chunk->page_list[i]));
if (!vaddr) {
ret = ERR_PTR(-EINVAL);
goto bail;
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 31a480e5b0d..6b3322486b5 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -63,6 +63,10 @@ struct mlx4_ib_sqp {
u8 header_buf[MLX4_IB_UD_HEADER_SIZE];
};
+enum {
+ MLX4_IB_MIN_SQ_STRIDE = 6
+};
+
static const __be32 mlx4_ib_opcode[] = {
[IB_WR_SEND] = __constant_cpu_to_be32(MLX4_OPCODE_SEND),
[IB_WR_SEND_WITH_IMM] = __constant_cpu_to_be32(MLX4_OPCODE_SEND_IMM),
@@ -285,9 +289,17 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
return 0;
}
-static int set_user_sq_size(struct mlx4_ib_qp *qp,
+static int set_user_sq_size(struct mlx4_ib_dev *dev,
+ struct mlx4_ib_qp *qp,
struct mlx4_ib_create_qp *ucmd)
{
+ /* Sanity check SQ size before proceeding */
+ if ((1 << ucmd->log_sq_bb_count) > dev->dev->caps.max_wqes ||
+ ucmd->log_sq_stride >
+ ilog2(roundup_pow_of_two(dev->dev->caps.max_sq_desc_sz)) ||
+ ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE)
+ return -EINVAL;
+
qp->sq.wqe_cnt = 1 << ucmd->log_sq_bb_count;
qp->sq.wqe_shift = ucmd->log_sq_stride;
@@ -330,7 +342,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
qp->sq_no_prefetch = ucmd.sq_no_prefetch;
- err = set_user_sq_size(qp, &ucmd);
+ err = set_user_sq_size(dev, qp, &ucmd);
if (err)
goto err;
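set_user_sq_size() now range-checks the user-supplied, log-encoded values before shifting by them, since a hostile log_sq_bb_count or log_sq_stride could otherwise oversize the queue or shift out of range. A userspace sketch with illustrative caps (not the mlx4 values):

    #include <stdbool.h>

    /* Illustrative caps, not the mlx4 values. */
    #define MAX_WQES        16384   /* assumed dev->caps.max_wqes       */
    #define MAX_LOG_STRIDE  10      /* assumed log2(max_sq_desc_sz)     */
    #define MIN_LOG_STRIDE  6       /* as in MLX4_IB_MIN_SQ_STRIDE      */

    /* Validate log-encoded sizes from userspace before shifting by them. */
    static bool sq_size_valid(unsigned int log_bb_count, unsigned int log_stride)
    {
        return log_bb_count < 31 &&                /* keep the shift defined */
               (1U << log_bb_count) <= MAX_WQES &&
               log_stride <= MAX_LOG_STRIDE &&
               log_stride >= MIN_LOG_STRIDE;
    }

    int main(void)
    {
        return sq_size_valid(10, 6) && !sq_size_valid(30, 5) ? 0 : 1;
    }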
diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c
index be6e1e03bda..6bd9f139334 100644
--- a/drivers/infiniband/hw/mthca/mthca_cq.c
+++ b/drivers/infiniband/hw/mthca/mthca_cq.c
@@ -204,16 +204,11 @@ static void dump_cqe(struct mthca_dev *dev, void *cqe_ptr)
static inline void update_cons_index(struct mthca_dev *dev, struct mthca_cq *cq,
int incr)
{
- __be32 doorbell[2];
-
if (mthca_is_memfree(dev)) {
*cq->set_ci_db = cpu_to_be32(cq->cons_index);
wmb();
} else {
- doorbell[0] = cpu_to_be32(MTHCA_TAVOR_CQ_DB_INC_CI | cq->cqn);
- doorbell[1] = cpu_to_be32(incr - 1);
-
- mthca_write64(doorbell,
+ mthca_write64(MTHCA_TAVOR_CQ_DB_INC_CI | cq->cqn, incr - 1,
dev->kar + MTHCA_CQ_DOORBELL,
MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
/*
@@ -731,17 +726,12 @@ repoll:
int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags)
{
- __be32 doorbell[2];
+ u32 dbhi = ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
+ MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL :
+ MTHCA_TAVOR_CQ_DB_REQ_NOT) |
+ to_mcq(cq)->cqn;
- doorbell[0] = cpu_to_be32(((flags & IB_CQ_SOLICITED_MASK) ==
- IB_CQ_SOLICITED ?
- MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL :
- MTHCA_TAVOR_CQ_DB_REQ_NOT) |
- to_mcq(cq)->cqn);
- doorbell[1] = (__force __be32) 0xffffffff;
-
- mthca_write64(doorbell,
- to_mdev(cq->device)->kar + MTHCA_CQ_DOORBELL,
+ mthca_write64(dbhi, 0xffffffff, to_mdev(cq->device)->kar + MTHCA_CQ_DOORBELL,
MTHCA_GET_DOORBELL_LOCK(&to_mdev(cq->device)->doorbell_lock));
return 0;
@@ -750,19 +740,16 @@ int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags)
int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
{
struct mthca_cq *cq = to_mcq(ibcq);
- __be32 doorbell[2];
- u32 sn;
- __be32 ci;
-
- sn = cq->arm_sn & 3;
- ci = cpu_to_be32(cq->cons_index);
+ __be32 db_rec[2];
+ u32 dbhi;
+ u32 sn = cq->arm_sn & 3;
- doorbell[0] = ci;
- doorbell[1] = cpu_to_be32((cq->cqn << 8) | (2 << 5) | (sn << 3) |
- ((flags & IB_CQ_SOLICITED_MASK) ==
- IB_CQ_SOLICITED ? 1 : 2));
+ db_rec[0] = cpu_to_be32(cq->cons_index);
+ db_rec[1] = cpu_to_be32((cq->cqn << 8) | (2 << 5) | (sn << 3) |
+ ((flags & IB_CQ_SOLICITED_MASK) ==
+ IB_CQ_SOLICITED ? 1 : 2));
- mthca_write_db_rec(doorbell, cq->arm_db);
+ mthca_write_db_rec(db_rec, cq->arm_db);
/*
* Make sure that the doorbell record in host memory is
@@ -770,14 +757,12 @@ int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
*/
wmb();
- doorbell[0] = cpu_to_be32((sn << 28) |
- ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
- MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL :
- MTHCA_ARBEL_CQ_DB_REQ_NOT) |
- cq->cqn);
- doorbell[1] = ci;
+ dbhi = (sn << 28) |
+ ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
+ MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL :
+ MTHCA_ARBEL_CQ_DB_REQ_NOT) | cq->cqn;
- mthca_write64(doorbell,
+ mthca_write64(dbhi, cq->cons_index,
to_mdev(ibcq->device)->kar + MTHCA_CQ_DOORBELL,
MTHCA_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->doorbell_lock));
diff --git a/drivers/infiniband/hw/mthca/mthca_doorbell.h b/drivers/infiniband/hw/mthca/mthca_doorbell.h
index dd9a44d170c..b374dc395be 100644
--- a/drivers/infiniband/hw/mthca/mthca_doorbell.h
+++ b/drivers/infiniband/hw/mthca/mthca_doorbell.h
@@ -58,10 +58,10 @@ static inline void mthca_write64_raw(__be64 val, void __iomem *dest)
__raw_writeq((__force u64) val, dest);
}
-static inline void mthca_write64(__be32 val[2], void __iomem *dest,
+static inline void mthca_write64(u32 hi, u32 lo, void __iomem *dest,
spinlock_t *doorbell_lock)
{
- __raw_writeq(*(u64 *) val, dest);
+ __raw_writeq((__force u64) cpu_to_be64((u64) hi << 32 | lo), dest);
}
static inline void mthca_write_db_rec(__be32 val[2], __be32 *db)
@@ -87,14 +87,17 @@ static inline void mthca_write64_raw(__be64 val, void __iomem *dest)
__raw_writel(((__force u32 *) &val)[1], dest + 4);
}
-static inline void mthca_write64(__be32 val[2], void __iomem *dest,
+static inline void mthca_write64(u32 hi, u32 lo, void __iomem *dest,
spinlock_t *doorbell_lock)
{
unsigned long flags;
+ hi = (__force u32) cpu_to_be32(hi);
+ lo = (__force u32) cpu_to_be32(lo);
+
spin_lock_irqsave(doorbell_lock, flags);
- __raw_writel((__force u32) val[0], dest);
- __raw_writel((__force u32) val[1], dest + 4);
+ __raw_writel(hi, dest);
+ __raw_writel(lo, dest + 4);
spin_unlock_irqrestore(doorbell_lock, flags);
}
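The new mthca_write64(hi, lo, ...) signature is behavior-preserving: composing the halves as cpu_to_be64((u64)hi << 32 | lo) emits the same byte stream as the two cpu_to_be32() word writes of the 32-bit fallback. A userspace check, with htonl() and manual serialization standing in for the kernel helpers:

    #include <arpa/inet.h>   /* htonl() stands in for cpu_to_be32() */
    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void)
    {
        uint32_t hi = 0x12345678, lo = 0x9abcdef0;
        uint64_t v = (uint64_t)hi << 32 | lo;
        uint32_t be_hi = htonl(hi), be_lo = htonl(lo);
        unsigned char a[8], b[8];
        int i;

        /* 64-bit path: one big-endian quadword, MSB first. */
        for (i = 0; i < 8; i++)
            a[i] = v >> ((7 - i) * 8);

        /* 32-bit path: two big-endian words written back to back. */
        memcpy(b, &be_hi, 4);
        memcpy(b + 4, &be_lo, 4);

        assert(memcmp(a, b, 8) == 0);
        return 0;
    }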
diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c
index 8592b26dc4e..b29de51b7f3 100644
--- a/drivers/infiniband/hw/mthca/mthca_eq.c
+++ b/drivers/infiniband/hw/mthca/mthca_eq.c
@@ -173,11 +173,6 @@ static inline u64 async_mask(struct mthca_dev *dev)
static inline void tavor_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
{
- __be32 doorbell[2];
-
- doorbell[0] = cpu_to_be32(MTHCA_EQ_DB_SET_CI | eq->eqn);
- doorbell[1] = cpu_to_be32(ci & (eq->nent - 1));
-
/*
* This barrier makes sure that all updates to ownership bits
* done by set_eqe_hw() hit memory before the consumer index
@@ -187,7 +182,7 @@ static inline void tavor_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u
* having set_eqe_hw() overwrite the owner field.
*/
wmb();
- mthca_write64(doorbell,
+ mthca_write64(MTHCA_EQ_DB_SET_CI | eq->eqn, ci & (eq->nent - 1),
dev->kar + MTHCA_EQ_DOORBELL,
MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
}
@@ -212,12 +207,7 @@ static inline void set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
static inline void tavor_eq_req_not(struct mthca_dev *dev, int eqn)
{
- __be32 doorbell[2];
-
- doorbell[0] = cpu_to_be32(MTHCA_EQ_DB_REQ_NOT | eqn);
- doorbell[1] = 0;
-
- mthca_write64(doorbell,
+ mthca_write64(MTHCA_EQ_DB_REQ_NOT | eqn, 0,
dev->kar + MTHCA_EQ_DOORBELL,
MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
}
@@ -230,12 +220,7 @@ static inline void arbel_eq_req_not(struct mthca_dev *dev, u32 eqn_mask)
static inline void disarm_cq(struct mthca_dev *dev, int eqn, int cqn)
{
if (!mthca_is_memfree(dev)) {
- __be32 doorbell[2];
-
- doorbell[0] = cpu_to_be32(MTHCA_EQ_DB_DISARM_CQ | eqn);
- doorbell[1] = cpu_to_be32(cqn);
-
- mthca_write64(doorbell,
+ mthca_write64(MTHCA_EQ_DB_DISARM_CQ | eqn, cqn,
dev->kar + MTHCA_EQ_DOORBELL,
MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
}
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c
index e61f3e62698..007b38157fc 100644
--- a/drivers/infiniband/hw/mthca/mthca_memfree.c
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -71,7 +71,7 @@ static void mthca_free_icm_pages(struct mthca_dev *dev, struct mthca_icm_chunk *
PCI_DMA_BIDIRECTIONAL);
for (i = 0; i < chunk->npages; ++i)
- __free_pages(chunk->mem[i].page,
+ __free_pages(sg_page(&chunk->mem[i]),
get_order(chunk->mem[i].length));
}
@@ -81,7 +81,7 @@ static void mthca_free_icm_coherent(struct mthca_dev *dev, struct mthca_icm_chun
for (i = 0; i < chunk->npages; ++i) {
dma_free_coherent(&dev->pdev->dev, chunk->mem[i].length,
- lowmem_page_address(chunk->mem[i].page),
+ lowmem_page_address(sg_page(&chunk->mem[i])),
sg_dma_address(&chunk->mem[i]));
}
}
@@ -107,10 +107,13 @@ void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent)
static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_mask)
{
- mem->page = alloc_pages(gfp_mask, order);
- if (!mem->page)
+ struct page *page;
+
+ page = alloc_pages(gfp_mask, order);
+ if (!page)
return -ENOMEM;
+ sg_set_page(mem, page);
mem->length = PAGE_SIZE << order;
mem->offset = 0;
return 0;
@@ -157,6 +160,7 @@ struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages,
if (!chunk)
goto fail;
+ sg_init_table(chunk->mem, MTHCA_ICM_CHUNK_LEN);
chunk->npages = 0;
chunk->nsg = 0;
list_add_tail(&chunk->list, &icm->chunk_list);
@@ -304,7 +308,7 @@ void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_h
* so if we found the page, dma_handle has already
* been assigned to. */
if (chunk->mem[i].length > offset) {
- page = chunk->mem[i].page;
+ page = sg_page(&chunk->mem[i]);
goto out;
}
offset -= chunk->mem[i].length;
@@ -445,6 +449,7 @@ static u64 mthca_uarc_virt(struct mthca_dev *dev, struct mthca_uar *uar, int pag
int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
struct mthca_user_db_table *db_tab, int index, u64 uaddr)
{
+ struct page *pages[1];
int ret = 0;
u8 status;
int i;
@@ -472,16 +477,17 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
}
ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK, 1, 1, 0,
- &db_tab->page[i].mem.page, NULL);
+ pages, NULL);
if (ret < 0)
goto out;
+ sg_set_page(&db_tab->page[i].mem, pages[0]);
db_tab->page[i].mem.length = MTHCA_ICM_PAGE_SIZE;
db_tab->page[i].mem.offset = uaddr & ~PAGE_MASK;
ret = pci_map_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
if (ret < 0) {
- put_page(db_tab->page[i].mem.page);
+ put_page(pages[0]);
goto out;
}
@@ -491,7 +497,7 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
ret = -EINVAL;
if (ret) {
pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
- put_page(db_tab->page[i].mem.page);
+ put_page(sg_page(&db_tab->page[i].mem));
goto out;
}
@@ -557,7 +563,7 @@ void mthca_cleanup_user_db_tab(struct mthca_dev *dev, struct mthca_uar *uar,
if (db_tab->page[i].uvirt) {
mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, uar, i), 1, &status);
pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
- put_page(db_tab->page[i].mem.page);
+ put_page(sg_page(&db_tab->page[i].mem));
}
}
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
index df01b2026a6..0e5461c6573 100644
--- a/drivers/infiniband/hw/mthca/mthca_qp.c
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -1799,15 +1799,11 @@ int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
out:
if (likely(nreq)) {
- __be32 doorbell[2];
-
- doorbell[0] = cpu_to_be32(((qp->sq.next_ind << qp->sq.wqe_shift) +
- qp->send_wqe_offset) | f0 | op0);
- doorbell[1] = cpu_to_be32((qp->qpn << 8) | size0);
-
wmb();
- mthca_write64(doorbell,
+ mthca_write64(((qp->sq.next_ind << qp->sq.wqe_shift) +
+ qp->send_wqe_offset) | f0 | op0,
+ (qp->qpn << 8) | size0,
dev->kar + MTHCA_SEND_DOORBELL,
MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
/*
@@ -1829,7 +1825,6 @@ int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
{
struct mthca_dev *dev = to_mdev(ibqp->device);
struct mthca_qp *qp = to_mqp(ibqp);
- __be32 doorbell[2];
unsigned long flags;
int err = 0;
int nreq;
@@ -1907,13 +1902,10 @@ int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) {
nreq = 0;
- doorbell[0] = cpu_to_be32((qp->rq.next_ind << qp->rq.wqe_shift) | size0);
- doorbell[1] = cpu_to_be32(qp->qpn << 8);
-
wmb();
- mthca_write64(doorbell,
- dev->kar + MTHCA_RECEIVE_DOORBELL,
+ mthca_write64((qp->rq.next_ind << qp->rq.wqe_shift) | size0,
+ qp->qpn << 8, dev->kar + MTHCA_RECEIVE_DOORBELL,
MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
qp->rq.next_ind = ind;
@@ -1923,13 +1915,10 @@ int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
out:
if (likely(nreq)) {
- doorbell[0] = cpu_to_be32((qp->rq.next_ind << qp->rq.wqe_shift) | size0);
- doorbell[1] = cpu_to_be32((qp->qpn << 8) | nreq);
-
wmb();
- mthca_write64(doorbell,
- dev->kar + MTHCA_RECEIVE_DOORBELL,
+ mthca_write64((qp->rq.next_ind << qp->rq.wqe_shift) | size0,
+ qp->qpn << 8 | nreq, dev->kar + MTHCA_RECEIVE_DOORBELL,
MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
}
@@ -1951,7 +1940,7 @@ int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
{
struct mthca_dev *dev = to_mdev(ibqp->device);
struct mthca_qp *qp = to_mqp(ibqp);
- __be32 doorbell[2];
+ u32 dbhi;
void *wqe;
void *prev_wqe;
unsigned long flags;
@@ -1981,10 +1970,8 @@ int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
if (unlikely(nreq == MTHCA_ARBEL_MAX_WQES_PER_SEND_DB)) {
nreq = 0;
- doorbell[0] = cpu_to_be32((MTHCA_ARBEL_MAX_WQES_PER_SEND_DB << 24) |
- ((qp->sq.head & 0xffff) << 8) |
- f0 | op0);
- doorbell[1] = cpu_to_be32((qp->qpn << 8) | size0);
+ dbhi = (MTHCA_ARBEL_MAX_WQES_PER_SEND_DB << 24) |
+ ((qp->sq.head & 0xffff) << 8) | f0 | op0;
qp->sq.head += MTHCA_ARBEL_MAX_WQES_PER_SEND_DB;
@@ -2000,7 +1987,8 @@ int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
* write MMIO send doorbell.
*/
wmb();
- mthca_write64(doorbell,
+
+ mthca_write64(dbhi, (qp->qpn << 8) | size0,
dev->kar + MTHCA_SEND_DOORBELL,
MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
}
@@ -2154,10 +2142,7 @@ int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
out:
if (likely(nreq)) {
- doorbell[0] = cpu_to_be32((nreq << 24) |
- ((qp->sq.head & 0xffff) << 8) |
- f0 | op0);
- doorbell[1] = cpu_to_be32((qp->qpn << 8) | size0);
+ dbhi = (nreq << 24) | ((qp->sq.head & 0xffff) << 8) | f0 | op0;
qp->sq.head += nreq;
@@ -2173,8 +2158,8 @@ out:
* write MMIO send doorbell.
*/
wmb();
- mthca_write64(doorbell,
- dev->kar + MTHCA_SEND_DOORBELL,
+
+ mthca_write64(dbhi, (qp->qpn << 8) | size0, dev->kar + MTHCA_SEND_DOORBELL,
MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
}
diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c
index 3f58c11a62b..553d681f681 100644
--- a/drivers/infiniband/hw/mthca/mthca_srq.c
+++ b/drivers/infiniband/hw/mthca/mthca_srq.c
@@ -491,7 +491,6 @@ int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
{
struct mthca_dev *dev = to_mdev(ibsrq->device);
struct mthca_srq *srq = to_msrq(ibsrq);
- __be32 doorbell[2];
unsigned long flags;
int err = 0;
int first_ind;
@@ -563,16 +562,13 @@ int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) {
nreq = 0;
- doorbell[0] = cpu_to_be32(first_ind << srq->wqe_shift);
- doorbell[1] = cpu_to_be32(srq->srqn << 8);
-
/*
* Make sure that descriptors are written
* before doorbell is rung.
*/
wmb();
- mthca_write64(doorbell,
+ mthca_write64(first_ind << srq->wqe_shift, srq->srqn << 8,
dev->kar + MTHCA_RECEIVE_DOORBELL,
MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
@@ -581,16 +577,13 @@ int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
}
if (likely(nreq)) {
- doorbell[0] = cpu_to_be32(first_ind << srq->wqe_shift);
- doorbell[1] = cpu_to_be32((srq->srqn << 8) | nreq);
-
/*
* Make sure that descriptors are written before
* doorbell is rung.
*/
wmb();
- mthca_write64(doorbell,
+ mthca_write64(first_ind << srq->wqe_shift, (srq->srqn << 8) | nreq,
dev->kar + MTHCA_RECEIVE_DOORBELL,
MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 1b3327ad6bc..eb7edab0e83 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -84,9 +84,8 @@ enum {
IPOIB_MCAST_RUN = 6,
IPOIB_STOP_REAPER = 7,
IPOIB_MCAST_STARTED = 8,
- IPOIB_FLAG_NETIF_STOPPED = 9,
- IPOIB_FLAG_ADMIN_CM = 10,
- IPOIB_FLAG_UMCAST = 11,
+ IPOIB_FLAG_ADMIN_CM = 9,
+ IPOIB_FLAG_UMCAST = 10,
IPOIB_MAX_BACKOFF_SECONDS = 16,
@@ -98,9 +97,9 @@ enum {
#define IPOIB_OP_RECV (1ul << 31)
#ifdef CONFIG_INFINIBAND_IPOIB_CM
-#define IPOIB_CM_OP_SRQ (1ul << 30)
+#define IPOIB_OP_CM (1ul << 30)
#else
-#define IPOIB_CM_OP_SRQ (0)
+#define IPOIB_OP_CM (0)
#endif
/* structs */
@@ -197,7 +196,6 @@ struct ipoib_cm_rx {
struct ipoib_cm_tx {
struct ib_cm_id *id;
- struct ib_cq *cq;
struct ib_qp *qp;
struct list_head list;
struct net_device *dev;
@@ -294,6 +292,7 @@ struct ipoib_dev_priv {
unsigned tx_tail;
struct ib_sge tx_sge;
struct ib_send_wr tx_wr;
+ unsigned tx_outstanding;
struct ib_wc ibwc[IPOIB_NUM_WC];
@@ -504,6 +503,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx);
void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
unsigned int mtu);
void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc);
+void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc);
#else
struct ipoib_cm_tx;
@@ -592,6 +592,9 @@ static inline void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *w
{
}
+static inline void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
+{
+}
#endif
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 0a0dcb8fdfd..87610772a97 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -87,7 +87,7 @@ static int ipoib_cm_post_receive(struct net_device *dev, int id)
struct ib_recv_wr *bad_wr;
int i, ret;
- priv->cm.rx_wr.wr_id = id | IPOIB_CM_OP_SRQ;
+ priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
for (i = 0; i < IPOIB_CM_RX_SG; ++i)
priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
@@ -401,7 +401,7 @@ static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
- unsigned int wr_id = wc->wr_id & ~IPOIB_CM_OP_SRQ;
+ unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
struct sk_buff *skb, *newskb;
struct ipoib_cm_rx *p;
unsigned long flags;
@@ -412,7 +412,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
wr_id, wc->status);
if (unlikely(wr_id >= ipoib_recvq_size)) {
- if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~IPOIB_CM_OP_SRQ)) {
+ if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
spin_lock_irqsave(&priv->lock, flags);
list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
ipoib_cm_start_rx_drain(priv);
@@ -434,7 +434,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
goto repost;
}
- if (!likely(wr_id & IPOIB_CM_RX_UPDATE_MASK)) {
+ if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) {
p = wc->qp->qp_context;
if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
spin_lock_irqsave(&priv->lock, flags);
@@ -498,7 +498,7 @@ static inline int post_send(struct ipoib_dev_priv *priv,
priv->tx_sge.addr = addr;
priv->tx_sge.length = len;
- priv->tx_wr.wr_id = wr_id;
+ priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM;
return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
}
@@ -549,20 +549,19 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
dev->trans_start = jiffies;
++tx->tx_head;
- if (tx->tx_head - tx->tx_tail == ipoib_sendq_size) {
+ if (++priv->tx_outstanding == ipoib_sendq_size) {
ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
tx->qp->qp_num);
netif_stop_queue(dev);
- set_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags);
}
}
}
-static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx,
- struct ib_wc *wc)
+void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
- unsigned int wr_id = wc->wr_id;
+ struct ipoib_cm_tx *tx = wc->qp->qp_context;
+ unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
struct ipoib_tx_buf *tx_req;
unsigned long flags;
@@ -587,11 +586,10 @@ static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx
spin_lock_irqsave(&priv->tx_lock, flags);
++tx->tx_tail;
- if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags)) &&
- tx->tx_head - tx->tx_tail <= ipoib_sendq_size >> 1) {
- clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags);
+ if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+ netif_queue_stopped(dev) &&
+ test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
netif_wake_queue(dev);
- }
if (wc->status != IB_WC_SUCCESS &&
wc->status != IB_WC_WR_FLUSH_ERR) {
@@ -614,11 +612,6 @@ static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx
tx->neigh = NULL;
}
- /* queue would be re-started anyway when TX is destroyed,
- * but it makes sense to do it ASAP here. */
- if (test_and_clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags))
- netif_wake_queue(dev);
-
if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
list_move(&tx->list, &priv->cm.reap_list);
queue_work(ipoib_workqueue, &priv->cm.reap_task);
@@ -632,19 +625,6 @@ static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx
spin_unlock_irqrestore(&priv->tx_lock, flags);
}
-static void ipoib_cm_tx_completion(struct ib_cq *cq, void *tx_ptr)
-{
- struct ipoib_cm_tx *tx = tx_ptr;
- int n, i;
-
- ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
- do {
- n = ib_poll_cq(cq, IPOIB_NUM_WC, tx->ibwc);
- for (i = 0; i < n; ++i)
- ipoib_cm_handle_tx_wc(tx->dev, tx, tx->ibwc + i);
- } while (n == IPOIB_NUM_WC);
-}
-
int ipoib_cm_dev_open(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -807,17 +787,18 @@ static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
return 0;
}
-static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ib_cq *cq)
+static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ib_qp_init_attr attr = {
- .send_cq = cq,
+ .send_cq = priv->cq,
.recv_cq = priv->cq,
.srq = priv->cm.srq,
.cap.max_send_wr = ipoib_sendq_size,
.cap.max_send_sge = 1,
.sq_sig_type = IB_SIGNAL_ALL_WR,
.qp_type = IB_QPT_RC,
+ .qp_context = tx
};
return ib_create_qp(priv->pd, &attr);
@@ -899,21 +880,7 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
goto err_tx;
}
- p->cq = ib_create_cq(priv->ca, ipoib_cm_tx_completion, NULL, p,
- ipoib_sendq_size + 1, 0);
- if (IS_ERR(p->cq)) {
- ret = PTR_ERR(p->cq);
- ipoib_warn(priv, "failed to allocate tx cq: %d\n", ret);
- goto err_cq;
- }
-
- ret = ib_req_notify_cq(p->cq, IB_CQ_NEXT_COMP);
- if (ret) {
- ipoib_warn(priv, "failed to request completion notification: %d\n", ret);
- goto err_req_notify;
- }
-
- p->qp = ipoib_cm_create_tx_qp(p->dev, p->cq);
+ p->qp = ipoib_cm_create_tx_qp(p->dev, p);
if (IS_ERR(p->qp)) {
ret = PTR_ERR(p->qp);
ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
@@ -950,12 +917,8 @@ err_modify:
err_id:
p->id = NULL;
ib_destroy_qp(p->qp);
-err_req_notify:
err_qp:
p->qp = NULL;
- ib_destroy_cq(p->cq);
-err_cq:
- p->cq = NULL;
err_tx:
return ret;
}
@@ -964,6 +927,8 @@ static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
{
struct ipoib_dev_priv *priv = netdev_priv(p->dev);
struct ipoib_tx_buf *tx_req;
+ unsigned long flags;
+ unsigned long begin;
ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail);
@@ -971,27 +936,40 @@ static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
if (p->id)
ib_destroy_cm_id(p->id);
- if (p->qp)
- ib_destroy_qp(p->qp);
-
- if (p->cq)
- ib_destroy_cq(p->cq);
-
- if (test_bit(IPOIB_FLAG_NETIF_STOPPED, &p->flags))
- netif_wake_queue(p->dev);
-
if (p->tx_ring) {
+ /* Wait for all sends to complete */
+ begin = jiffies;
while ((int) p->tx_tail - (int) p->tx_head < 0) {
- tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
- ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
- DMA_TO_DEVICE);
- dev_kfree_skb_any(tx_req->skb);
- ++p->tx_tail;
+ if (time_after(jiffies, begin + 5 * HZ)) {
+ ipoib_warn(priv, "timing out; %d sends not completed\n",
+ p->tx_head - p->tx_tail);
+ goto timeout;
+ }
+
+ msleep(1);
}
+ }
- kfree(p->tx_ring);
+timeout:
+
+ while ((int) p->tx_tail - (int) p->tx_head < 0) {
+ tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
+ ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
+ DMA_TO_DEVICE);
+ dev_kfree_skb_any(tx_req->skb);
+ ++p->tx_tail;
+ spin_lock_irqsave(&priv->tx_lock, flags);
+ if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+ netif_queue_stopped(p->dev) &&
+ test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+ netif_wake_queue(p->dev);
+ spin_unlock_irqrestore(&priv->tx_lock, flags);
}
+ if (p->qp)
+ ib_destroy_qp(p->qp);
+
+ kfree(p->tx_ring);
kfree(p);
}
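
The hunks above replace the old per-connection IPOIB_FLAG_NETIF_STOPPED flag
with a single priv->tx_outstanding counter shared by the datagram and
connected-mode send paths, so one counter now governs the netif queue. A
minimal sketch of the watermark scheme (the helper names are illustrative,
not from the patch; callers hold priv->tx_lock, as in the hunks):

static void sketch_tx_posted(struct net_device *dev,
			     struct ipoib_dev_priv *priv)
{
	if (++priv->tx_outstanding == ipoib_sendq_size)
		netif_stop_queue(dev);		/* ring is now full */
}

static void sketch_tx_completed(struct net_device *dev,
				struct ipoib_dev_priv *priv)
{
	/* Wake exactly once, at the half-empty watermark, and only if
	 * the interface is administratively up. */
	if (--priv->tx_outstanding == ipoib_sendq_size >> 1 &&
	    netif_queue_stopped(dev) &&
	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
		netif_wake_queue(dev);
}
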
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 1a77e79f6b4..5063dd509ad 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -267,11 +267,10 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
spin_lock_irqsave(&priv->tx_lock, flags);
++priv->tx_tail;
- if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags)) &&
- priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) {
- clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
+ if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+ netif_queue_stopped(dev) &&
+ test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
netif_wake_queue(dev);
- }
spin_unlock_irqrestore(&priv->tx_lock, flags);
if (wc->status != IB_WC_SUCCESS &&
@@ -301,14 +300,18 @@ poll_more:
for (i = 0; i < n; i++) {
struct ib_wc *wc = priv->ibwc + i;
- if (wc->wr_id & IPOIB_CM_OP_SRQ) {
- ++done;
- ipoib_cm_handle_rx_wc(dev, wc);
- } else if (wc->wr_id & IPOIB_OP_RECV) {
+ if (wc->wr_id & IPOIB_OP_RECV) {
++done;
- ipoib_ib_handle_rx_wc(dev, wc);
- } else
- ipoib_ib_handle_tx_wc(dev, wc);
+ if (wc->wr_id & IPOIB_OP_CM)
+ ipoib_cm_handle_rx_wc(dev, wc);
+ else
+ ipoib_ib_handle_rx_wc(dev, wc);
+ } else {
+ if (wc->wr_id & IPOIB_OP_CM)
+ ipoib_cm_handle_tx_wc(dev, wc);
+ else
+ ipoib_ib_handle_tx_wc(dev, wc);
+ }
}
if (n != t)
@@ -401,10 +404,9 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
address->last_send = priv->tx_head;
++priv->tx_head;
- if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) {
+ if (++priv->tx_outstanding == ipoib_sendq_size) {
ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
netif_stop_queue(dev);
- set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
}
}
}
@@ -436,7 +438,8 @@ void ipoib_reap_ah(struct work_struct *work)
__ipoib_reap_ah(dev);
if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
- queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);
+ queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
+ round_jiffies_relative(HZ));
}
int ipoib_ib_dev_open(struct net_device *dev)
@@ -472,7 +475,8 @@ int ipoib_ib_dev_open(struct net_device *dev)
}
clear_bit(IPOIB_STOP_REAPER, &priv->flags);
- queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);
+ queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
+ round_jiffies_relative(HZ));
set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
@@ -561,12 +565,17 @@ void ipoib_drain_cq(struct net_device *dev)
if (priv->ibwc[i].status == IB_WC_SUCCESS)
priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;
- if (priv->ibwc[i].wr_id & IPOIB_CM_OP_SRQ)
- ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
- else if (priv->ibwc[i].wr_id & IPOIB_OP_RECV)
- ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
- else
- ipoib_ib_handle_tx_wc(dev, priv->ibwc + i);
+ if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) {
+ if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
+ ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
+ else
+ ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
+ } else {
+ if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
+ ipoib_cm_handle_tx_wc(dev, priv->ibwc + i);
+ else
+ ipoib_ib_handle_tx_wc(dev, priv->ibwc + i);
+ }
}
} while (n == IPOIB_NUM_WC);
}
@@ -612,6 +621,7 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
DMA_TO_DEVICE);
dev_kfree_skb_any(tx_req->skb);
++priv->tx_tail;
+ --priv->tx_outstanding;
}
for (i = 0; i < ipoib_recvq_size; ++i) {
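
With connected-mode completions now arriving on the shared CQ, ownership of
each work completion is encoded entirely in wr_id bits: IPOIB_OP_RECV picks
receive vs. send, IPOIB_OP_CM picks connected vs. datagram mode, and the CM
TX context travels in wc->qp->qp_context. A condensed sketch of the
convention (assuming the flag bits sit above any real ring index):

/* post (CM TX):     tx_wr.wr_id = ring_index | IPOIB_OP_CM;
 * complete (CM TX): ring_index  = wc->wr_id & ~IPOIB_OP_CM; */
static void sketch_dispatch(struct net_device *dev, struct ib_wc *wc)
{
	if (wc->wr_id & IPOIB_OP_RECV) {
		if (wc->wr_id & IPOIB_OP_CM)
			ipoib_cm_handle_rx_wc(dev, wc);
		else
			ipoib_ib_handle_rx_wc(dev, wc);
	} else {
		if (wc->wr_id & IPOIB_OP_CM)
			ipoib_cm_handle_tx_wc(dev, wc); /* tx from qp_context */
		else
			ipoib_ib_handle_tx_wc(dev, wc);
	}
}
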
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 362610d870e..a03a65ebcf0 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -148,8 +148,6 @@ static int ipoib_stop(struct net_device *dev)
netif_stop_queue(dev);
- clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
-
/*
* Now flush workqueue to make sure a scheduled task doesn't
* bring our internal state back up.
@@ -902,7 +900,7 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
goto out_rx_ring_cleanup;
}
- /* priv->tx_head & tx_tail are already 0 */
+ /* priv->tx_head, tx_tail & tx_outstanding are already 0 */
if (ipoib_ib_dev_init(dev, ca, port))
goto out_tx_ring_cleanup;
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index f3529b6f0a3..d6879806179 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -131,7 +131,7 @@ static int iser_start_rdma_unaligned_sg(struct iscsi_iser_cmd_task *iser_ctask,
p = mem;
for_each_sg(sgl, sg, data->size, i) {
- from = kmap_atomic(sg->page, KM_USER0);
+ from = kmap_atomic(sg_page(sg), KM_USER0);
memcpy(p,
from + sg->offset,
sg->length);
@@ -191,7 +191,7 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_cmd_task *iser_ctask,
p = mem;
for_each_sg(sgl, sg, sg_size, i) {
- to = kmap_atomic(sg->page, KM_SOFTIRQ0);
+ to = kmap_atomic(sg_page(sg), KM_SOFTIRQ0);
memcpy(to + sg->offset,
p,
sg->length);
@@ -300,7 +300,7 @@ static unsigned int iser_data_buf_aligned_len(struct iser_data_buf *data,
for_each_sg(sgl, sg, data->dma_nents, i) {
/* iser_dbg("Checking sg iobuf [%d]: phys=0x%08lX "
"offset: %ld sz: %ld\n", i,
- (unsigned long)page_to_phys(sg->page),
+ (unsigned long)sg_phys(sg),
(unsigned long)sg->offset,
(unsigned long)sg->length); */
end_addr = ib_sg_dma_address(ibdev, sg) +
@@ -336,7 +336,7 @@ static void iser_data_buf_dump(struct iser_data_buf *data,
iser_err("sg[%d] dma_addr:0x%lX page:0x%p "
"off:0x%x sz:0x%x dma_len:0x%x\n",
i, (unsigned long)ib_sg_dma_address(ibdev, sg),
- sg->page, sg->offset,
+ sg_page(sg), sg->offset,
sg->length, ib_sg_dma_len(ibdev, sg));
}
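
These hunks are the iSER side of the tree-wide switch to the chained
scatterlist API: with chaining, the low bits of sg->page can carry chain
markers, so entries must always be read through sg_page(); sg_phys()
likewise replaces page_to_phys(sg->page) and, note, additionally folds in
sg->offset. A short sketch of the accessor idiom, for a populated list sgl
with nents entries:

#include <linux/scatterlist.h>

static void sketch_walk_sg(struct scatterlist *sgl, int nents)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sgl, sg, nents, i)			/* chain-aware walk */
		pr_debug("sg[%d]: page %p phys %#lx len %u\n",
			 i, sg_page(sg),		/* never sg->page   */
			 (unsigned long)sg_phys(sg),	/* incl. sg->offset */
			 sg->length);
}
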
diff --git a/drivers/input/keyboard/bf54x-keys.c b/drivers/input/keyboard/bf54x-keys.c
index a67b29b089e..e5f4da92834 100644
--- a/drivers/input/keyboard/bf54x-keys.c
+++ b/drivers/input/keyboard/bf54x-keys.c
@@ -256,7 +256,6 @@ static int __devinit bfin_kpad_probe(struct platform_device *pdev)
printk(KERN_ERR DRV_NAME
": unable to claim irq %d; error %d\n",
bf54x_kpad->irq, error);
- error = -EBUSY;
goto out2;
}
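
The one-line deletion above is a classic error-path fix: request_irq()
already returned a meaningful errno in 'error', and overwriting it with
-EBUSY before the goto masked the real cause. The corrected shape (the
request_irq() arguments here are illustrative, not lifted from the driver):

error = request_irq(bf54x_kpad->irq, bfin_kpad_isr, 0, DRV_NAME, pdev);
if (error) {
	printk(KERN_ERR DRV_NAME ": unable to claim irq %d; error %d\n",
	       bf54x_kpad->irq, error);
	goto out2;	/* 'error' still holds request_irq()'s result */
}
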
diff --git a/drivers/input/mouse/appletouch.c b/drivers/input/mouse/appletouch.c
index 0117817bf53..f132702d137 100644
--- a/drivers/input/mouse/appletouch.c
+++ b/drivers/input/mouse/appletouch.c
@@ -504,25 +504,22 @@ static void atp_complete(struct urb* urb)
memset(dev->xy_acc, 0, sizeof(dev->xy_acc));
}
- /* Geyser 3 will continue to send packets continually after
+ input_report_key(dev->input, BTN_LEFT, key);
+ input_sync(dev->input);
+
+ /* Many Geysers will continue to send packets continually after
the first touch unless reinitialised. Do so if it's been
idle for a while in order to avoid waking the kernel up
several hundred times a second */
- if (atp_is_geyser_3(dev)) {
- if (!x && !y && !key) {
- dev->idlecount++;
- if (dev->idlecount == 10) {
- dev->valid = 0;
- schedule_work(&dev->work);
- }
+ if (!x && !y && !key) {
+ dev->idlecount++;
+ if (dev->idlecount == 10) {
+ dev->valid = 0;
+ schedule_work(&dev->work);
}
- else
- dev->idlecount = 0;
- }
-
- input_report_key(dev->input, BTN_LEFT, key);
- input_sync(dev->input);
+ } else
+ dev->idlecount = 0;
exit:
retval = usb_submit_urb(dev->urb, GFP_ATOMIC);
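
The appletouch reshuffle does two things: the input report is hoisted ahead
of the idle bookkeeping, and the idle-reinit heuristic (formerly Geyser 3
only) now covers all Geysers. Note the one-shot behaviour of the counter in
the patch's logic, annotated:

if (!x && !y && !key) {
	dev->idlecount++;
	if (dev->idlecount == 10) {	/* fires once per idle stretch  */
		dev->valid = 0;		/* reinit from process context  */
		schedule_work(&dev->work);
	}				/* counts past 10 don't re-fire */
} else
	dev->idlecount = 0;		/* any activity rearms the check */
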
diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c
index 11dafc0ee99..1a0cea3c529 100644
--- a/drivers/input/serio/i8042.c
+++ b/drivers/input/serio/i8042.c
@@ -20,6 +20,7 @@
#include <linux/err.h>
#include <linux/rcupdate.h>
#include <linux/platform_device.h>
+#include <linux/i8042.h>
#include <asm/io.h>
@@ -208,7 +209,7 @@ static int __i8042_command(unsigned char *param, int command)
return 0;
}
-static int i8042_command(unsigned char *param, int command)
+int i8042_command(unsigned char *param, int command)
{
unsigned long flags;
int retval;
@@ -219,6 +220,7 @@ static int i8042_command(unsigned char *param, int command)
return retval;
}
+EXPORT_SYMBOL(i8042_command);
/*
* i8042_kbd_write() sends a byte out through the keyboard interface.
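
Exporting i8042_command() gives other drivers a properly locked entry point
to the keyboard controller; the command constants leave the private header
in the next diff, presumably for <linux/i8042.h>, which i8042.c now
includes. A hypothetical user reading the controller's command byte:

#include <linux/i8042.h>

static int sketch_read_ctr(unsigned char *ctr)
{
	/* I8042_CMD_CTL_RCTR (0x0120): no parameter bytes, one result
	 * byte; i8042_command() takes the controller lock itself. */
	return i8042_command(ctr, I8042_CMD_CTL_RCTR);	/* 0 on success */
}
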
diff --git a/drivers/input/serio/i8042.h b/drivers/input/serio/i8042.h
index b3eb7a72d96..dd22d91f8b3 100644
--- a/drivers/input/serio/i8042.h
+++ b/drivers/input/serio/i8042.h
@@ -61,28 +61,6 @@
#define I8042_CTR_XLATE 0x40
/*
- * Commands.
- */
-
-#define I8042_CMD_CTL_RCTR 0x0120
-#define I8042_CMD_CTL_WCTR 0x1060
-#define I8042_CMD_CTL_TEST 0x01aa
-
-#define I8042_CMD_KBD_DISABLE 0x00ad
-#define I8042_CMD_KBD_ENABLE 0x00ae
-#define I8042_CMD_KBD_TEST 0x01ab
-#define I8042_CMD_KBD_LOOP 0x11d2
-
-#define I8042_CMD_AUX_DISABLE 0x00a7
-#define I8042_CMD_AUX_ENABLE 0x00a8
-#define I8042_CMD_AUX_TEST 0x01a9
-#define I8042_CMD_AUX_SEND 0x10d4
-#define I8042_CMD_AUX_LOOP 0x11d3
-
-#define I8042_CMD_MUX_PFX 0x0090
-#define I8042_CMD_MUX_SEND 0x1090
-
-/*
* Return codes.
*/
diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig
index e3e0baa1a15..fa8442b6241 100644
--- a/drivers/input/touchscreen/Kconfig
+++ b/drivers/input/touchscreen/Kconfig
@@ -202,6 +202,7 @@ config TOUCHSCREEN_USB_COMPOSITE
- DMC TSC-10/25
- IRTOUCHSYSTEMS/UNITOP
- IdealTEK URTC1000
+ - GoTop Super_Q2/GogoPen/PenPower tablets
Have a look at <http://linux.chapter7.ch/touchkit/> for
a usage description and the required user-space stuff.
@@ -259,4 +260,9 @@ config TOUCHSCREEN_USB_GENERAL_TOUCH
bool "GeneralTouch Touchscreen device support" if EMBEDDED
depends on TOUCHSCREEN_USB_COMPOSITE
+config TOUCHSCREEN_USB_GOTOP
+ default y
+ bool "GoTop Super_Q2/GogoPen/PenPower tablet device support" if EMBEDDED
+ depends on TOUCHSCREEN_USB_COMPOSITE
+
endif
diff --git a/drivers/input/touchscreen/usbtouchscreen.c b/drivers/input/touchscreen/usbtouchscreen.c
index 5f34b78d5dd..19055e7381f 100644
--- a/drivers/input/touchscreen/usbtouchscreen.c
+++ b/drivers/input/touchscreen/usbtouchscreen.c
@@ -11,8 +11,9 @@
* - DMC TSC-10/25
* - IRTOUCHSYSTEMS/UNITOP
* - IdealTEK URTC1000
+ * - GoTop Super_Q2/GogoPen/PenPower tablets
*
- * Copyright (C) 2004-2006 by Daniel Ritz <daniel.ritz@gmx.ch>
+ * Copyright (C) 2004-2007 by Daniel Ritz <daniel.ritz@gmx.ch>
* Copyright (C) by Todd E. Johnson (mtouchusb.c)
*
* This program is free software; you can redistribute it and/or
@@ -115,6 +116,7 @@ enum {
DEVTYPE_IRTOUCH,
DEVTYPE_IDEALTEK,
DEVTYPE_GENERAL_TOUCH,
+ DEVTYPE_GOTOP,
};
static struct usb_device_id usbtouch_devices[] = {
@@ -168,6 +170,12 @@ static struct usb_device_id usbtouch_devices[] = {
{USB_DEVICE(0x0dfc, 0x0001), .driver_info = DEVTYPE_GENERAL_TOUCH},
#endif
+#ifdef CONFIG_TOUCHSCREEN_USB_GOTOP
+ {USB_DEVICE(0x08f2, 0x007f), .driver_info = DEVTYPE_GOTOP},
+ {USB_DEVICE(0x08f2, 0x00ce), .driver_info = DEVTYPE_GOTOP},
+ {USB_DEVICE(0x08f2, 0x00f4), .driver_info = DEVTYPE_GOTOP},
+#endif
+
{}
};
@@ -501,6 +509,20 @@ static int general_touch_read_data(struct usbtouch_usb *dev, unsigned char *pkt)
#endif
/*****************************************************************************
+ * GoTop Part
+ */
+#ifdef CONFIG_TOUCHSCREEN_USB_GOTOP
+static int gotop_read_data(struct usbtouch_usb *dev, unsigned char *pkt)
+{
+ dev->x = ((pkt[1] & 0x38) << 4) | pkt[2];
+ dev->y = ((pkt[1] & 0x07) << 7) | pkt[3];
+ dev->touch = pkt[0] & 0x01;
+ return 1;
+}
+#endif
+
+
+/*****************************************************************************
* the different device descriptors
*/
static struct usbtouch_device_info usbtouch_dev_info[] = {
@@ -623,9 +645,19 @@ static struct usbtouch_device_info usbtouch_dev_info[] = {
.max_yc = 0x0500,
.rept_size = 7,
.read_data = general_touch_read_data,
- }
+ },
#endif
+#ifdef CONFIG_TOUCHSCREEN_USB_GOTOP
+ [DEVTYPE_GOTOP] = {
+ .min_xc = 0x0,
+ .max_xc = 0x03ff,
+ .min_yc = 0x0,
+ .max_yc = 0x03ff,
+ .rept_size = 4,
+ .read_data = gotop_read_data,
+ },
+#endif
};
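
gotop_read_data() above unpacks 10-bit coordinates from the 4-byte report:
bit 0 of pkt[0] is the touch flag, pkt[1] holds the high coordinate bits
(X in bits 5:3, Y in bits 2:0), and pkt[2]/pkt[3] hold the low bits,
matching the 0x03ff axis ranges in the descriptor. Worked through on a
made-up report:

static void sketch_gotop_decode(void)
{
	/* Hypothetical report: finger down at (X = 0x2c5, Y = 0x155). */
	unsigned char pkt[4] = { 0x01, 0x2a, 0x45, 0x55 };

	int x = ((pkt[1] & 0x38) << 4) | pkt[2]; /* (0x28 << 4) | 0x45 = 0x2c5 */
	int y = ((pkt[1] & 0x07) << 7) | pkt[3]; /* (0x02 << 7) | 0x55 = 0x155 */
	int touch = pkt[0] & 0x01;               /* 1: touching */
}
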
diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig
index 8749fa4ffce..656920636cb 100644
--- a/drivers/kvm/Kconfig
+++ b/drivers/kvm/Kconfig
@@ -47,4 +47,8 @@ config KVM_AMD
Provides support for KVM on AMD processors equipped with the AMD-V
(SVM) extensions.
+# OK, it's a little counter-intuitive to do this, but it puts it neatly under
+# the virtualization menu.
+source drivers/lguest/Kconfig
+
endif # VIRTUALIZATION
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index af2d288c881..07ae280e8fe 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -198,21 +198,15 @@ static void vcpu_put(struct kvm_vcpu *vcpu)
static void ack_flush(void *_completed)
{
- atomic_t *completed = _completed;
-
- atomic_inc(completed);
}
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
- int i, cpu, needed;
+ int i, cpu;
cpumask_t cpus;
struct kvm_vcpu *vcpu;
- atomic_t completed;
- atomic_set(&completed, 0);
cpus_clear(cpus);
- needed = 0;
for (i = 0; i < KVM_MAX_VCPUS; ++i) {
vcpu = kvm->vcpus[i];
if (!vcpu)
@@ -221,23 +215,9 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
continue;
cpu = vcpu->cpu;
if (cpu != -1 && cpu != raw_smp_processor_id())
- if (!cpu_isset(cpu, cpus)) {
- cpu_set(cpu, cpus);
- ++needed;
- }
- }
-
- /*
- * We really want smp_call_function_mask() here. But that's not
- * available, so ipi all cpus in parallel and wait for them
- * to complete.
- */
- for (cpu = first_cpu(cpus); cpu != NR_CPUS; cpu = next_cpu(cpu, cpus))
- smp_call_function_single(cpu, ack_flush, &completed, 1, 0);
- while (atomic_read(&completed) != needed) {
- cpu_relax();
- barrier();
+ cpu_set(cpu, cpus);
}
+ smp_call_function_mask(cpus, ack_flush, NULL, 1);
}
int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
@@ -2054,12 +2034,21 @@ again:
kvm_x86_ops->run(vcpu, kvm_run);
- kvm_guest_exit();
vcpu->guest_mode = 0;
local_irq_enable();
++vcpu->stat.exits;
+ /*
+ * We must have an instruction between local_irq_enable() and
+ * kvm_guest_exit(), so the timer interrupt isn't delayed by
+ * the interrupt shadow. The stat.exits increment will do nicely.
+ * But we need to prevent reordering, hence this barrier():
+ */
+ barrier();
+
+ kvm_guest_exit();
+
preempt_enable();
/*
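
Two independent fixes here: kvm_flush_remote_tlbs() drops its hand-rolled
IPI-and-spin in favour of smp_call_function_mask(..., wait=1), which only
returns once ack_flush() has run on every CPU in the mask (hence the now
empty handler); and the run loop moves kvm_guest_exit() after
local_irq_enable(). The ordering matters because sti has a one-instruction
interrupt shadow, so at least one instruction must retire between enabling
interrupts and the exit accounting, and barrier() keeps the compiler from
undoing that. The required shape, annotated:

local_irq_enable();	/* sti: IRQs deliverable after ONE more insn     */
++vcpu->stat.exits;	/* that instruction: the interrupt-shadow filler */
barrier();		/* compiler-only fence; keeps the increment put  */
kvm_guest_exit();	/* a pending timer tick is no longer delayed     */
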
diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c
index a190587cf6a..238fcad3cec 100644
--- a/drivers/kvm/lapic.c
+++ b/drivers/kvm/lapic.c
@@ -494,12 +494,19 @@ static void apic_send_ipi(struct kvm_lapic *apic)
static u32 apic_get_tmcct(struct kvm_lapic *apic)
{
- u32 counter_passed;
- ktime_t passed, now = apic->timer.dev.base->get_time();
- u32 tmcct = apic_get_reg(apic, APIC_TMICT);
+ u64 counter_passed;
+ ktime_t passed, now;
+ u32 tmcct;
ASSERT(apic != NULL);
+ now = apic->timer.dev.base->get_time();
+ tmcct = apic_get_reg(apic, APIC_TMICT);
+
+ /* if initial count is 0, current count should also be 0 */
+ if (tmcct == 0)
+ return 0;
+
if (unlikely(ktime_to_ns(now) <=
ktime_to_ns(apic->timer.last_update))) {
/* Wrap around */
@@ -514,15 +521,24 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
counter_passed = div64_64(ktime_to_ns(passed),
(APIC_BUS_CYCLE_NS * apic->timer.divide_count));
- tmcct -= counter_passed;
- if (tmcct <= 0) {
- if (unlikely(!apic_lvtt_period(apic)))
+ if (counter_passed > tmcct) {
+ if (unlikely(!apic_lvtt_period(apic))) {
+ /* one-shot timers stick at 0 until reset */
tmcct = 0;
- else
- do {
- tmcct += apic_get_reg(apic, APIC_TMICT);
- } while (tmcct <= 0);
+ } else {
+ /*
+ * periodic timers reset to APIC_TMICT when they
+ * hit 0. The while loop simulates this happening N
+ * times. (counter_passed %= tmcct) would also work,
+ * but a 64-bit modulo may be slower, or simply unavailable, on 32-bit hosts.
+ */
+ while (counter_passed > tmcct)
+ counter_passed -= tmcct;
+ tmcct -= counter_passed;
+ }
+ } else {
+ tmcct -= counter_passed;
}
return tmcct;
@@ -853,7 +869,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
- apic->timer.divide_count = 0;
+ update_divide_count(apic);
atomic_set(&apic->timer.pending, 0);
if (vcpu->vcpu_id == 0)
vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
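
For a periodic LAPIC timer the new apic_get_tmcct() reduces counter_passed
against the reload value by repeated subtraction. It is almost, but not
quite, counter_passed %= tmcct: at an exact multiple of the period the loop
stops at tmcct and yields a current count of 0 (the instant of expiry),
where a plain modulo would yield a freshly reloaded tmcct; the loop also
avoids needing a 64-bit division helper on 32-bit hosts. Sketched as a
hypothetical helper (tmcct != 0 is guaranteed by the early return added
above):

static u32 sketch_periodic_tmcct(u64 counter_passed, u32 tmcct)
{
	while (counter_passed > tmcct)	/* leaves (0, tmcct]          */
		counter_passed -= tmcct;
	return tmcct - counter_passed;	/* 0 exactly at expiry points */
}
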
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
index 6d84d30f5ed..feb5ac986c5 100644
--- a/drivers/kvm/mmu.c
+++ b/drivers/kvm/mmu.c
@@ -1049,6 +1049,7 @@ int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
destroy_kvm_mmu(vcpu);
return init_kvm_mmu(vcpu);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
@@ -1088,7 +1089,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
mmu_page_remove_parent_pte(child, spte);
}
}
- *spte = 0;
+ set_shadow_pte(spte, 0);
kvm_flush_remote_tlbs(vcpu->kvm);
}
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 4f115a8e45e..bb56ae3f89b 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -523,6 +523,8 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
+ if (vcpu->rmode.active)
+ rflags |= IOPL_MASK | X86_EFLAGS_VM;
vmcs_writel(GUEST_RFLAGS, rflags);
}
@@ -1128,6 +1130,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs);
fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs);
+ kvm_mmu_reset_context(vcpu);
init_rmode_tss(vcpu->kvm);
}
@@ -1760,10 +1763,8 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
}
- if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
- asm ("int $2");
- return 1;
- }
+ if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+ return 1; /* already handled by vmx_vcpu_run() */
if (is_no_device(intr_info)) {
vmx_fpu_activate(vcpu);
@@ -2196,6 +2197,7 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 intr_info;
/*
* Loading guest fpu may have cleared host cr0.ts
@@ -2322,6 +2324,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
vmx->launched = 1;
+
+ intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ /* We need to handle NMIs before interrupts are enabled */
+ if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+ asm("int $2");
}
static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
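
The handle_exception() and vmx_vcpu_run() hunks are one logical change: the
"int $2" reflection of an NMI that arrived in guest mode moves to right
after the VM exit, while interrupts are still disabled, so the host NMI
handler runs before any ordinary interrupt can be taken. The tail of
vmx_vcpu_run(), annotated:

/* IRQs are still off here, straight after the VM exit. */
intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200)	/* NMI exit */
	asm("int $2");		/* invoke the host NMI handler now */
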
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
index 9737c3b2f48..a6ace302e0c 100644
--- a/drivers/kvm/x86_emulate.c
+++ b/drivers/kvm/x86_emulate.c
@@ -212,7 +212,8 @@ static u16 twobyte_table[256] = {
0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
DstReg | SrcMem16 | ModRM | Mov,
/* 0xC0 - 0xCF */
- 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
+ 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xD0 - 0xDF */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xE0 - 0xEF */
@@ -596,11 +597,10 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
case 0xf0: /* LOCK */
lock_prefix = 1;
break;
+ case 0xf2: /* REPNE/REPNZ */
case 0xf3: /* REP/REPE/REPZ */
rep_prefix = 1;
break;
- case 0xf2: /* REPNE/REPNZ */
- break;
default:
goto done_prefixes;
}
@@ -825,6 +825,14 @@ done_prefixes:
if (twobyte && b == 0x01 && modrm_reg == 7)
break;
srcmem_common:
+ /*
+ * For instructions with a ModR/M byte, switch to register
+ * access if Mod = 3.
+ */
+ if ((d & ModRM) && modrm_mod == 3) {
+ src.type = OP_REG;
+ break;
+ }
src.type = OP_MEM;
src.ptr = (unsigned long *)cr2;
src.val = 0;
@@ -893,6 +901,14 @@ done_prefixes:
dst.ptr = (unsigned long *)cr2;
dst.bytes = (d & ByteOp) ? 1 : op_bytes;
dst.val = 0;
+ /*
+ * For instructions with a ModR/M byte, switch to register
+ * access if Mod = 3.
+ */
+ if ((d & ModRM) && modrm_mod == 3) {
+ dst.type = OP_REG;
+ break;
+ }
if (d & BitOp) {
unsigned long mask = ~(dst.bytes * 8 - 1);
@@ -1083,31 +1099,6 @@ push:
case 0xd2 ... 0xd3: /* Grp2 */
src.val = _regs[VCPU_REGS_RCX];
goto grp2;
- case 0xe8: /* call (near) */ {
- long int rel;
- switch (op_bytes) {
- case 2:
- rel = insn_fetch(s16, 2, _eip);
- break;
- case 4:
- rel = insn_fetch(s32, 4, _eip);
- break;
- case 8:
- rel = insn_fetch(s64, 8, _eip);
- break;
- default:
- DPRINTF("Call: Invalid op_bytes\n");
- goto cannot_emulate;
- }
- src.val = (unsigned long) _eip;
- JMP_REL(rel);
- goto push;
- }
- case 0xe9: /* jmp rel */
- case 0xeb: /* jmp rel short */
- JMP_REL(src.val);
- no_wb = 1; /* Disable writeback. */
- break;
case 0xf6 ... 0xf7: /* Grp3 */
switch (modrm_reg) {
case 0 ... 1: /* test */
@@ -1350,6 +1341,32 @@ special_insn:
case 0xae ... 0xaf: /* scas */
DPRINTF("Urk! I don't handle SCAS.\n");
goto cannot_emulate;
+ case 0xe8: /* call (near) */ {
+ long int rel;
+ switch (op_bytes) {
+ case 2:
+ rel = insn_fetch(s16, 2, _eip);
+ break;
+ case 4:
+ rel = insn_fetch(s32, 4, _eip);
+ break;
+ case 8:
+ rel = insn_fetch(s64, 8, _eip);
+ break;
+ default:
+ DPRINTF("Call: Invalid op_bytes\n");
+ goto cannot_emulate;
+ }
+ src.val = (unsigned long) _eip;
+ JMP_REL(rel);
+ goto push;
+ }
+ case 0xe9: /* jmp rel */
+ case 0xeb: /* jmp rel short */
+ JMP_REL(src.val);
+ no_wb = 1; /* Disable writeback. */
+ break;
+
}
goto writeback;
@@ -1501,6 +1518,10 @@ twobyte_insn:
dst.bytes = op_bytes;
dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val;
break;
+ case 0xc3: /* movnti */
+ dst.bytes = op_bytes;
+ dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val;
+ break;
}
goto writeback;
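
The emulator changes above decode the 0xf2 prefix as a rep prefix, move the
near call/jmp cases into the special_insn block, add 0f c3 (movnti) to the
two-byte table, and teach operand setup that a ModR/M byte with Mod = 3
names a register rather than memory (previously every ModRM operand was
treated as memory at the faulting address). For reference, the standard
ModR/M field layout (x86 SDM; context, not code from the patch):

struct modrm_fields { unsigned mod, reg, rm; };

static struct modrm_fields decode_modrm(unsigned char modrm)
{
	struct modrm_fields f = {
		.mod = (modrm >> 6) & 3, /* 3 => r/m names a register     */
		.reg = (modrm >> 3) & 7, /* register, or opcode extension */
		.rm  = modrm & 7,        /* register or addressing mode   */
	};
	return f;
}

/* e.g. modrm 0xd8 = 11 011 000b: mod 3, so register-to-register forms
 * now take the new OP_REG path instead of faulting as memory. */
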
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index 41e2250613a..7eb9ecff8f4 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -1,7 +1,6 @@
config LGUEST
tristate "Linux hypervisor example code"
- depends on X86 && PARAVIRT && EXPERIMENTAL && !X86_PAE && FUTEX
- select LGUEST_GUEST
+ depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX && !(X86_VISWS || X86_VOYAGER)
select HVC_DRIVER
---help---
This is a very simple module which allows you to run
@@ -18,13 +17,3 @@ config LGUEST_GUEST
The guest needs code built-in, even if the host has lguest
support as a module. The drivers are tiny, so we build them
in too.
-
-config LGUEST_NET
- tristate
- default y
- depends on LGUEST_GUEST && NET
-
-config LGUEST_BLOCK
- tristate
- default y
- depends on LGUEST_GUEST && BLOCK
diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile
index e5047471c33..5e8272d296d 100644
--- a/drivers/lguest/Makefile
+++ b/drivers/lguest/Makefile
@@ -1,10 +1,12 @@
-# Guest requires the paravirt_ops replacement and the bus driver.
-obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_asm.o lguest_bus.o
+# Guest requires the device configuration and probing code.
+obj-$(CONFIG_LGUEST_GUEST) += lguest_device.o
# Host requires the other files, which can be a module.
obj-$(CONFIG_LGUEST) += lg.o
-lg-y := core.o hypercalls.o page_tables.o interrupts_and_traps.o \
- segments.o io.o lguest_user.o switcher.o
+lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \
+ segments.o lguest_user.o
+
+lg-$(CONFIG_X86_32) += x86/switcher_32.o x86/core.o
Preparation Preparation!: PREFIX=P
Guest: PREFIX=G
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index a0788c12b39..35d19ae58de 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -11,58 +11,20 @@
#include <linux/vmalloc.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
+#include <linux/highmem.h>
#include <asm/paravirt.h>
-#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
#include <asm/poll.h>
-#include <asm/highmem.h>
#include <asm/asm-offsets.h>
-#include <asm/i387.h>
#include "lg.h"
-/* Found in switcher.S */
-extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
-extern unsigned long default_idt_entries[];
-
-/* Every guest maps the core switcher code. */
-#define SHARED_SWITCHER_PAGES \
- DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
-/* Pages for switcher itself, then two pages per cpu */
-#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS)
-
-/* We map at -4M for ease of mapping into the guest (one PTE page). */
-#define SWITCHER_ADDR 0xFFC00000
static struct vm_struct *switcher_vma;
static struct page **switcher_page;
-static int cpu_had_pge;
-static struct {
- unsigned long offset;
- unsigned short segment;
-} lguest_entry;
-
/* This One Big lock protects all inter-guest data structures. */
DEFINE_MUTEX(lguest_lock);
-static DEFINE_PER_CPU(struct lguest *, last_guest);
-
-/* FIXME: Make dynamic. */
-#define MAX_LGUEST_GUESTS 16
-struct lguest lguests[MAX_LGUEST_GUESTS];
-
-/* Offset from where switcher.S was compiled to where we've copied it */
-static unsigned long switcher_offset(void)
-{
- return SWITCHER_ADDR - (unsigned long)start_switcher_text;
-}
-
-/* This cpu's struct lguest_pages. */
-static struct lguest_pages *lguest_pages(unsigned int cpu)
-{
- return &(((struct lguest_pages *)
- (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
-}
/*H:010 We need to set up the Switcher at a high virtual address. Remember the
* Switcher is a few hundred bytes of assembler code which actually changes the
@@ -73,9 +35,7 @@ static struct lguest_pages *lguest_pages(unsigned int cpu)
* Host since it will be running as the switchover occurs.
*
* Trying to map memory at a particular address is an unusual thing to do, so
- * it's not a simple one-liner. We also set up the per-cpu parts of the
- * Switcher here.
- */
+ * it's not a simple one-liner. */
static __init int map_switcher(void)
{
int i, err;
@@ -132,90 +92,11 @@ static __init int map_switcher(void)
goto free_vma;
}
- /* Now the switcher is mapped at the right address, we can't fail!
- * Copy in the compiled-in Switcher code (from switcher.S). */
+ /* Now the Switcher is mapped at the right address, we can't fail!
+ * Copy in the compiled-in Switcher code (from <arch>_switcher.S). */
memcpy(switcher_vma->addr, start_switcher_text,
end_switcher_text - start_switcher_text);
- /* Most of the switcher.S doesn't care that it's been moved; on Intel,
- * jumps are relative, and it doesn't access any references to external
- * code or data.
- *
- * The only exception is the interrupt handlers in switcher.S: their
- * addresses are placed in a table (default_idt_entries), so we need to
- * update the table with the new addresses. switcher_offset() is a
- * convenience function which returns the distance between the builtin
- * switcher code and the high-mapped copy we just made. */
- for (i = 0; i < IDT_ENTRIES; i++)
- default_idt_entries[i] += switcher_offset();
-
- /*
- * Set up the Switcher's per-cpu areas.
- *
- * Each CPU gets two pages of its own within the high-mapped region
- * (aka. "struct lguest_pages"). Much of this can be initialized now,
- * but some depends on what Guest we are running (which is set up in
- * copy_in_guest_info()).
- */
- for_each_possible_cpu(i) {
- /* lguest_pages() returns this CPU's two pages. */
- struct lguest_pages *pages = lguest_pages(i);
- /* This is a convenience pointer to make the code fit one
- * statement to a line. */
- struct lguest_ro_state *state = &pages->state;
-
- /* The Global Descriptor Table: the Host has a different one
- * for each CPU. We keep a descriptor for the GDT which says
- * where it is and how big it is (the size is actually the last
- * byte, not the size, hence the "-1"). */
- state->host_gdt_desc.size = GDT_SIZE-1;
- state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
-
- /* All CPUs on the Host use the same Interrupt Descriptor
- * Table, so we just use store_idt(), which gets this CPU's IDT
- * descriptor. */
- store_idt(&state->host_idt_desc);
-
- /* The descriptors for the Guest's GDT and IDT can be filled
- * out now, too. We copy the GDT & IDT into ->guest_gdt and
- * ->guest_idt before actually running the Guest. */
- state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
- state->guest_idt_desc.address = (long)&state->guest_idt;
- state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
- state->guest_gdt_desc.address = (long)&state->guest_gdt;
-
- /* We know where we want the stack to be when the Guest enters
- * the switcher: in pages->regs. The stack grows upwards, so
- * we start it at the end of that structure. */
- state->guest_tss.esp0 = (long)(&pages->regs + 1);
- /* And this is the GDT entry to use for the stack: we keep a
- * couple of special LGUEST entries. */
- state->guest_tss.ss0 = LGUEST_DS;
-
- /* x86 can have a finegrained bitmap which indicates what I/O
- * ports the process can use. We set it to the end of our
- * structure, meaning "none". */
- state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
-
- /* Some GDT entries are the same across all Guests, so we can
- * set them up now. */
- setup_default_gdt_entries(state);
- /* Most IDT entries are the same for all Guests, too.*/
- setup_default_idt_entries(state, default_idt_entries);
-
- /* The Host needs to be able to use the LGUEST segments on this
- * CPU, too, so put them in the Host GDT. */
- get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
- get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
- }
-
- /* In the Switcher, we want the %cs segment register to use the
- * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
- * it will be undisturbed when we switch. To change %cs and jump we
- * need this structure to feed to Intel's "lcall" instruction. */
- lguest_entry.offset = (long)switch_to_guest + switcher_offset();
- lguest_entry.segment = LGUEST_CS;
-
printk(KERN_INFO "lguest: mapped switcher at %p\n",
switcher_vma->addr);
/* And we succeeded... */
@@ -247,86 +128,12 @@ static void unmap_switcher(void)
__free_pages(switcher_page[i], 0);
}
-/*H:130 Our Guest is usually so well behaved; it never tries to do things it
- * isn't allowed to. Unfortunately, Linux's paravirtual infrastructure isn't
- * quite complete, because it doesn't contain replacements for the Intel I/O
- * instructions. As a result, the Guest sometimes fumbles across one during
- * the boot process as it probes for various things which are usually attached
- * to a PC.
- *
- * When the Guest uses one of these instructions, we get trap #13 (General
- * Protection Fault) and come here. We see if it's one of those troublesome
- * instructions and skip over it. We return true if we did. */
-static int emulate_insn(struct lguest *lg)
-{
- u8 insn;
- unsigned int insnlen = 0, in = 0, shift = 0;
- /* The eip contains the *virtual* address of the Guest's instruction:
- * guest_pa just subtracts the Guest's page_offset. */
- unsigned long physaddr = guest_pa(lg, lg->regs->eip);
-
- /* The guest_pa() function only works for Guest kernel addresses, but
- * that's all we're trying to do anyway. */
- if (lg->regs->eip < lg->page_offset)
- return 0;
-
- /* Decoding x86 instructions is icky. */
- lgread(lg, &insn, physaddr, 1);
-
- /* 0x66 is an "operand prefix". It means it's using the upper 16 bits
- of the eax register. */
- if (insn == 0x66) {
- shift = 16;
- /* The instruction is 1 byte so far, read the next byte. */
- insnlen = 1;
- lgread(lg, &insn, physaddr + insnlen, 1);
- }
-
- /* We can ignore the lower bit for the moment and decode the 4 opcodes
- * we need to emulate. */
- switch (insn & 0xFE) {
- case 0xE4: /* in <next byte>,%al */
- insnlen += 2;
- in = 1;
- break;
- case 0xEC: /* in (%dx),%al */
- insnlen += 1;
- in = 1;
- break;
- case 0xE6: /* out %al,<next byte> */
- insnlen += 2;
- break;
- case 0xEE: /* out %al,(%dx) */
- insnlen += 1;
- break;
- default:
- /* OK, we don't know what this is, can't emulate. */
- return 0;
- }
-
- /* If it was an "IN" instruction, they expect the result to be read
- * into %eax, so we change %eax. We always return all-ones, which
- * traditionally means "there's nothing there". */
- if (in) {
- /* Lower bit tells us whether it's a 16 or 32 bit access */
- if (insn & 0x1)
- lg->regs->eax = 0xFFFFFFFF;
- else
- lg->regs->eax |= (0xFFFF << shift);
- }
- /* Finally, we've "done" the instruction, so move past it. */
- lg->regs->eip += insnlen;
- /* Success! */
- return 1;
-}
-/*:*/
-
/*L:305
* Dealing With Guest Memory.
*
* When the Guest gives us (what it thinks is) a physical address, we can use
- * the normal copy_from_user() & copy_to_user() on that address: remember,
- * Guest physical == Launcher virtual.
+ * the normal copy_from_user() & copy_to_user() on the corresponding place in
+ * the memory region allocated by the Launcher.
*
* But we can't trust the Guest: it might be trying to access the Launcher
* code. We have to check that the range is below the pfn_limit the Launcher
@@ -338,148 +145,27 @@ int lguest_address_ok(const struct lguest *lg,
return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr);
}
-/* This is a convenient routine to get a 32-bit value from the Guest (a very
- * common operation). Here we can see how useful the kill_lguest() routine we
- * met in the Launcher can be: we return a random value (0) instead of needing
- * to return an error. */
-u32 lgread_u32(struct lguest *lg, unsigned long addr)
-{
- u32 val = 0;
-
- /* Don't let them access lguest binary. */
- if (!lguest_address_ok(lg, addr, sizeof(val))
- || get_user(val, (u32 __user *)addr) != 0)
- kill_guest(lg, "bad read address %#lx", addr);
- return val;
-}
-
-/* Same thing for writing a value. */
-void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val)
-{
- if (!lguest_address_ok(lg, addr, sizeof(val))
- || put_user(val, (u32 __user *)addr) != 0)
- kill_guest(lg, "bad write address %#lx", addr);
-}
-
-/* This routine is more generic, and copies a range of Guest bytes into a
- * buffer. If the copy_from_user() fails, we fill the buffer with zeroes, so
- * the caller doesn't end up using uninitialized kernel memory. */
-void lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes)
+/* This routine copies memory from the Guest. Here we can see how useful the
+ * kill_lguest() routine we met in the Launcher can be: we return a random
+ * value (all zeroes) instead of needing to return an error. */
+void __lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes)
{
if (!lguest_address_ok(lg, addr, bytes)
- || copy_from_user(b, (void __user *)addr, bytes) != 0) {
+ || copy_from_user(b, lg->mem_base + addr, bytes) != 0) {
/* copy_from_user should do this, but as we rely on it... */
memset(b, 0, bytes);
kill_guest(lg, "bad read address %#lx len %u", addr, bytes);
}
}
-/* Similarly, our generic routine to copy into a range of Guest bytes. */
-void lgwrite(struct lguest *lg, unsigned long addr, const void *b,
- unsigned bytes)
+/* This is the write (copy into guest) version. */
+void __lgwrite(struct lguest *lg, unsigned long addr, const void *b,
+ unsigned bytes)
{
if (!lguest_address_ok(lg, addr, bytes)
- || copy_to_user((void __user *)addr, b, bytes) != 0)
+ || copy_to_user(lg->mem_base + addr, b, bytes) != 0)
kill_guest(lg, "bad write address %#lx len %u", addr, bytes);
}
-/* (end of memory access helper routines) :*/
-
-static void set_ts(void)
-{
- u32 cr0;
-
- cr0 = read_cr0();
- if (!(cr0 & 8))
- write_cr0(cr0|8);
-}
-
-/*S:010
- * We are getting close to the Switcher.
- *
- * Remember that each CPU has two pages which are visible to the Guest when it
- * runs on that CPU. This has to contain the state for that Guest: we copy the
- * state in just before we run the Guest.
- *
- * Each Guest has "changed" flags which indicate what has changed in the Guest
- * since it last ran. We saw this set in interrupts_and_traps.c and
- * segments.c.
- */
-static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
-{
- /* Copying all this data can be quite expensive. We usually run the
- * same Guest we ran last time (and that Guest hasn't run anywhere else
- * meanwhile). If that's not the case, we pretend everything in the
- * Guest has changed. */
- if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) {
- __get_cpu_var(last_guest) = lg;
- lg->last_pages = pages;
- lg->changed = CHANGED_ALL;
- }
-
- /* These copies are pretty cheap, so we do them unconditionally: */
- /* Save the current Host top-level page directory. */
- pages->state.host_cr3 = __pa(current->mm->pgd);
- /* Set up the Guest's page tables to see this CPU's pages (and no
- * other CPU's pages). */
- map_switcher_in_guest(lg, pages);
- /* Set up the two "TSS" members which tell the CPU what stack to use
- * for traps which go directly into the Guest (ie. traps at privilege
- * level 1). */
- pages->state.guest_tss.esp1 = lg->esp1;
- pages->state.guest_tss.ss1 = lg->ss1;
-
- /* Copy direct-to-Guest trap entries. */
- if (lg->changed & CHANGED_IDT)
- copy_traps(lg, pages->state.guest_idt, default_idt_entries);
-
- /* Copy all GDT entries which the Guest can change. */
- if (lg->changed & CHANGED_GDT)
- copy_gdt(lg, pages->state.guest_gdt);
- /* If only the TLS entries have changed, copy them. */
- else if (lg->changed & CHANGED_GDT_TLS)
- copy_gdt_tls(lg, pages->state.guest_gdt);
-
- /* Mark the Guest as unchanged for next time. */
- lg->changed = 0;
-}
-
-/* Finally: the code to actually call into the Switcher to run the Guest. */
-static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
-{
- /* This is a dummy value we need for GCC's sake. */
- unsigned int clobber;
-
- /* Copy the guest-specific information into this CPU's "struct
- * lguest_pages". */
- copy_in_guest_info(lg, pages);
-
- /* Set the trap number to 256 (impossible value). If we fault while
- * switching to the Guest (bad segment registers or bug), this will
- * cause us to abort the Guest. */
- lg->regs->trapnum = 256;
-
- /* Now: we push the "eflags" register on the stack, then do an "lcall".
- * This is how we change from using the kernel code segment to using
- * the dedicated lguest code segment, as well as jumping into the
- * Switcher.
- *
- * The lcall also pushes the old code segment (KERNEL_CS) onto the
- * stack, then the address of this call. This stack layout happens to
- * exactly match the stack of an interrupt... */
- asm volatile("pushf; lcall *lguest_entry"
- /* This is how we tell GCC that %eax ("a") and %ebx ("b")
- * are changed by this routine. The "=" means output. */
- : "=a"(clobber), "=b"(clobber)
- /* %eax contains the pages pointer. ("0" refers to the
- * 0-th argument above, ie "a"). %ebx contains the
- * physical address of the Guest's top-level page
- * directory. */
- : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir))
- /* We tell gcc that all these registers could change,
- * which means we don't have to save and restore them in
- * the Switcher. */
- : "memory", "%edx", "%ecx", "%edi", "%esi");
-}
/*:*/
/*H:030 Let's jump straight to the main loop which runs the Guest.
@@ -489,22 +175,16 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
{
/* We stop running once the Guest is dead. */
while (!lg->dead) {
- /* We need to initialize this, otherwise gcc complains. It's
- * not (yet) clever enough to see that it's initialized when we
- * need it. */
- unsigned int cr2 = 0; /* Damn gcc */
-
- /* First we run any hypercalls the Guest wants done: either in
- * the hypercall ring in "struct lguest_data", or directly by
- * using int 31 (LGUEST_TRAP_ENTRY). */
- do_hypercalls(lg);
- /* It's possible the Guest did a SEND_DMA hypercall to the
+ /* First we run any hypercalls the Guest wants done. */
+ if (lg->hcall)
+ do_hypercalls(lg);
+
+ /* It's possible the Guest did a NOTIFY hypercall to the
* Launcher, in which case we return from the read() now. */
- if (lg->dma_is_pending) {
- if (put_user(lg->pending_dma, user) ||
- put_user(lg->pending_key, user+1))
+ if (lg->pending_notify) {
+ if (put_user(lg->pending_notify, user))
return -EFAULT;
- return sizeof(unsigned long)*2;
+ return sizeof(lg->pending_notify);
}
/* Check for signals */
@@ -542,144 +222,20 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
* the "Do Not Disturb" sign: */
local_irq_disable();
- /* Remember the awfully-named TS bit? If the Guest has asked
- * to set it we set it now, so we can trap and pass that trap
- * to the Guest if it uses the FPU. */
- if (lg->ts)
- set_ts();
-
- /* SYSENTER is an optimized way of doing system calls. We
- * can't allow it because it always jumps to privilege level 0.
- * A normal Guest won't try it because we don't advertise it in
- * CPUID, but a malicious Guest (or malicious Guest userspace
- * program) could, so we tell the CPU to disable it before
- * running the Guest. */
- if (boot_cpu_has(X86_FEATURE_SEP))
- wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
-
- /* Now we actually run the Guest. It will pop back out when
- * something interesting happens, and we can examine its
- * registers to see what it was doing. */
- run_guest_once(lg, lguest_pages(raw_smp_processor_id()));
-
- /* The "regs" pointer contains two extra entries which are not
- * really registers: a trap number which says what interrupt or
- * trap made the switcher code come back, and an error code
- * which some traps set. */
-
- /* If the Guest page faulted, then the cr2 register will tell
- * us the bad virtual address. We have to grab this now,
- * because once we re-enable interrupts an interrupt could
- * fault and thus overwrite cr2, or we could even move off to a
- * different CPU. */
- if (lg->regs->trapnum == 14)
- cr2 = read_cr2();
- /* Similarly, if we took a trap because the Guest used the FPU,
- * we have to restore the FPU it expects to see. */
- else if (lg->regs->trapnum == 7)
- math_state_restore();
-
- /* Restore SYSENTER if it's supposed to be on. */
- if (boot_cpu_has(X86_FEATURE_SEP))
- wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
+ /* Actually run the Guest until something happens. */
+ lguest_arch_run_guest(lg);
/* Now we're ready to be interrupted or moved to other CPUs */
local_irq_enable();
- /* OK, so what happened? */
- switch (lg->regs->trapnum) {
- case 13: /* We've intercepted a GPF. */
- /* Check if this was one of those annoying IN or OUT
- * instructions which we need to emulate. If so, we
- * just go back into the Guest after we've done it. */
- if (lg->regs->errcode == 0) {
- if (emulate_insn(lg))
- continue;
- }
- break;
- case 14: /* We've intercepted a page fault. */
- /* The Guest accessed a virtual address that wasn't
- * mapped. This happens a lot: we don't actually set
- * up most of the page tables for the Guest at all when
- * we start: as it runs it asks for more and more, and
- * we set them up as required. In this case, we don't
- * even tell the Guest that the fault happened.
- *
- * The errcode tells whether this was a read or a
- * write, and whether kernel or userspace code. */
- if (demand_page(lg, cr2, lg->regs->errcode))
- continue;
-
- /* OK, it's really not there (or not OK): the Guest
- * needs to know. We write out the cr2 value so it
- * knows where the fault occurred.
- *
- * Note that if the Guest were really messed up, this
- * could happen before it's done the INITIALIZE
- * hypercall, so lg->lguest_data will be NULL, so
- * &lg->lguest_data->cr2 will be address 8. Writing
- * into that address won't hurt the Host at all,
- * though. */
- if (put_user(cr2, &lg->lguest_data->cr2))
- kill_guest(lg, "Writing cr2");
- break;
- case 7: /* We've intercepted a Device Not Available fault. */
- /* If the Guest doesn't want to know, we already
- * restored the Floating Point Unit, so we just
- * continue without telling it. */
- if (!lg->ts)
- continue;
- break;
- case 32 ... 255:
- /* These values mean a real interrupt occurred, in
- * which case the Host handler has already been run.
- * We just do a friendly check if another process
- * should now be run, then fall through to loop
- * around: */
- cond_resched();
- case LGUEST_TRAP_ENTRY: /* Handled at top of loop */
- continue;
- }
-
- /* If we get here, it's a trap the Guest wants to know
- * about. */
- if (deliver_trap(lg, lg->regs->trapnum))
- continue;
-
- /* If the Guest doesn't have a handler (either it hasn't
- * registered any yet, or it's one of the faults we don't let
- * it handle), it dies with a cryptic error message. */
- kill_guest(lg, "unhandled trap %li at %#lx (%#lx)",
- lg->regs->trapnum, lg->regs->eip,
- lg->regs->trapnum == 14 ? cr2 : lg->regs->errcode);
+ /* Now we deal with whatever happened to the Guest. */
+ lguest_arch_handle_trap(lg);
}
+
/* The Guest is dead => "No such file or directory" */
return -ENOENT;
}
-/* Now we can look at each of the routines this calls, in increasing order of
- * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
- * deliver_trap() and demand_page(). After all those, we'll be ready to
- * examine the Switcher, and our philosophical understanding of the Host/Guest
- * duality will be complete. :*/
-
-int find_free_guest(void)
-{
- unsigned int i;
- for (i = 0; i < MAX_LGUEST_GUESTS; i++)
- if (!lguests[i].tsk)
- return i;
- return -1;
-}
-
-static void adjust_pge(void *on)
-{
- if (on)
- write_cr4(read_cr4() | X86_CR4_PGE);
- else
- write_cr4(read_cr4() & ~X86_CR4_PGE);
-}
-
/*H:000
* Welcome to the Host!
*
@@ -701,72 +257,50 @@ static int __init init(void)
/* First we put the Switcher up in very high virtual memory. */
err = map_switcher();
if (err)
- return err;
+ goto out;
/* Now we set up the pagetable implementation for the Guests. */
err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
- if (err) {
- unmap_switcher();
- return err;
- }
+ if (err)
+ goto unmap;
- /* The I/O subsystem needs some things initialized. */
- lguest_io_init();
+ /* We might need to reserve an interrupt vector. */
+ err = init_interrupts();
+ if (err)
+ goto free_pgtables;
/* /dev/lguest needs to be registered. */
err = lguest_device_init();
- if (err) {
- free_pagetables();
- unmap_switcher();
- return err;
- }
+ if (err)
+ goto free_interrupts;
- /* Finally, we need to turn off "Page Global Enable". PGE is an
- * optimization where page table entries are specially marked to show
- * they never change. The Host kernel marks all the kernel pages this
- * way because it's always present, even when userspace is running.
- *
- * Lguest breaks this: unbeknownst to the rest of the Host kernel, we
- * switch to the Guest kernel. If you don't disable this on all CPUs,
- * you'll get really weird bugs that you'll chase for two days.
- *
- * I used to turn PGE off every time we switched to the Guest and back
- * on when we return, but that slowed the Switcher down noticeably. */
-
- /* We don't need the complexity of CPUs coming and going while we're
- * doing this. */
- lock_cpu_hotplug();
- if (cpu_has_pge) { /* We have a broader idea of "global". */
- /* Remember that this was originally set (for cleanup). */
- cpu_had_pge = 1;
- /* adjust_pge is a helper function which sets or unsets the PGE
- * bit on its CPU, depending on the argument (0 == unset). */
- on_each_cpu(adjust_pge, (void *)0, 0, 1);
- /* Turn off the feature in the global feature set. */
- clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
- }
- unlock_cpu_hotplug();
+ /* Finally we do some architecture-specific setup. */
+ lguest_arch_host_init();
/* All good! */
return 0;
+
+free_interrupts:
+ free_interrupts();
+free_pgtables:
+ free_pagetables();
+unmap:
+ unmap_switcher();
+out:
+ return err;
}
/* Cleaning up is just the same code, backwards. With a little French. */
static void __exit fini(void)
{
lguest_device_remove();
+ free_interrupts();
free_pagetables();
unmap_switcher();
- /* If we had PGE before we started, turn it back on now. */
- lock_cpu_hotplug();
- if (cpu_had_pge) {
- set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
- /* adjust_pge's argument "1" means set PGE. */
- on_each_cpu(adjust_pge, (void *)1, 0, 1);
- }
- unlock_cpu_hotplug();
+ lguest_arch_host_fini();
}
+/*:*/
/* The Host side of lguest can be a module. This is a nice way for people to
* play with it. */
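
The big deletion in core.c is i386-specific machinery (Switcher per-cpu
setup, IN/OUT emulation, run_guest_once(), the PGE juggling) moving behind
the new lguest_arch_* hooks, into x86/core.c per the Makefile change above.
Meanwhile the memory helpers change meaning: a Guest "physical" address is
now an offset into the Launcher-supplied region at lg->mem_base, not a raw
Launcher virtual address. The bounds check is worth a close look; sketched:

/* As in lguest_address_ok() above: the second clause rejects an
 * addr + len that wraps past zero; the first keeps the last byte
 * below the Launcher-declared pfn_limit. */
static int sketch_address_ok(unsigned long addr, unsigned long len,
			     unsigned long pfn_limit)
{
	return (addr + len) / PAGE_SIZE < pfn_limit && (addr + len >= addr);
}

/* __lgread() then copies via copy_from_user(b, lg->mem_base + addr,
 * bytes) and, on failure, zeroes the buffer and kills the Guest
 * rather than returning an error. */
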
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index db6caace3b9..9d5184c7c14 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -25,17 +25,13 @@
#include <linux/mm.h>
#include <asm/page.h>
#include <asm/pgtable.h>
-#include <irq_vectors.h>
#include "lg.h"
-/*H:120 This is the core hypercall routine: where the Guest gets what it
- * wants. Or gets killed. Or, in the case of LHCALL_CRASH, both.
- *
- * Remember from the Guest: %eax == which call to make, and the arguments are
- * packed into %edx, %ebx and %ecx if needed. */
-static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
+/*H:120 This is the core hypercall routine: where the Guest gets what it wants.
+ * Or gets killed. Or, in the case of LHCALL_CRASH, both. */
+static void do_hcall(struct lguest *lg, struct hcall_args *args)
{
- switch (regs->eax) {
+ switch (args->arg0) {
case LHCALL_FLUSH_ASYNC:
/* This call does nothing, except by breaking out of the Guest
* it makes us process all the asynchronous hypercalls. */
@@ -51,7 +47,7 @@ static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
char msg[128];
/* If the lgread fails, it will call kill_guest() itself; the
* kill_guest() with the message will be ignored. */
- lgread(lg, msg, regs->edx, sizeof(msg));
+ __lgread(lg, msg, args->arg1, sizeof(msg));
msg[sizeof(msg)-1] = '\0';
kill_guest(lg, "CRASH: %s", msg);
break;
@@ -59,67 +55,49 @@ static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
case LHCALL_FLUSH_TLB:
/* FLUSH_TLB comes in two flavors, depending on the
* argument: */
- if (regs->edx)
+ if (args->arg1)
guest_pagetable_clear_all(lg);
else
guest_pagetable_flush_user(lg);
break;
- case LHCALL_BIND_DMA:
- /* BIND_DMA really wants four arguments, but it's the only call
- * which does. So the Guest packs the number of buffers and
- * the interrupt number into the final argument, and we decode
- * it here. This can legitimately fail, since we currently
- * place a limit on the number of DMA pools a Guest can have.
- * So we return true or false from this call. */
- regs->eax = bind_dma(lg, regs->edx, regs->ebx,
- regs->ecx >> 8, regs->ecx & 0xFF);
- break;
/* All these calls simply pass the arguments through to the right
* routines. */
- case LHCALL_SEND_DMA:
- send_dma(lg, regs->edx, regs->ebx);
- break;
- case LHCALL_LOAD_GDT:
- load_guest_gdt(lg, regs->edx, regs->ebx);
- break;
- case LHCALL_LOAD_IDT_ENTRY:
- load_guest_idt_entry(lg, regs->edx, regs->ebx, regs->ecx);
- break;
case LHCALL_NEW_PGTABLE:
- guest_new_pagetable(lg, regs->edx);
+ guest_new_pagetable(lg, args->arg1);
break;
case LHCALL_SET_STACK:
- guest_set_stack(lg, regs->edx, regs->ebx, regs->ecx);
+ guest_set_stack(lg, args->arg1, args->arg2, args->arg3);
break;
case LHCALL_SET_PTE:
- guest_set_pte(lg, regs->edx, regs->ebx, mkgpte(regs->ecx));
+ guest_set_pte(lg, args->arg1, args->arg2, __pte(args->arg3));
break;
case LHCALL_SET_PMD:
- guest_set_pmd(lg, regs->edx, regs->ebx);
- break;
- case LHCALL_LOAD_TLS:
- guest_load_tls(lg, regs->edx);
+ guest_set_pmd(lg, args->arg1, args->arg2);
break;
case LHCALL_SET_CLOCKEVENT:
- guest_set_clockevent(lg, regs->edx);
+ guest_set_clockevent(lg, args->arg1);
break;
-
case LHCALL_TS:
/* This sets the TS flag, as we saw used in run_guest(). */
- lg->ts = regs->edx;
+ lg->ts = args->arg1;
break;
case LHCALL_HALT:
/* Similarly, this sets the halted flag for run_guest(). */
lg->halted = 1;
break;
+ case LHCALL_NOTIFY:
+ lg->pending_notify = args->arg1;
+ break;
default:
- kill_guest(lg, "Bad hypercall %li\n", regs->eax);
+ if (lguest_arch_do_hcall(lg, args))
+ kill_guest(lg, "Bad hypercall %li\n", args->arg0);
}
}
+/*:*/
-/* Asynchronous hypercalls are easy: we just look in the array in the Guest's
- * "struct lguest_data" and see if there are any new ones marked "ready".
+/*H:124 Asynchronous hypercalls are easy: we just look in the array in the
+ * Guest's "struct lguest_data" to see if any new ones are marked "ready".
*
* We are careful to do these in order: obviously we respect the order the
* Guest put them in the ring, but we also promise the Guest that they will
@@ -134,10 +112,9 @@ static void do_async_hcalls(struct lguest *lg)
if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st)))
return;
-
/* We process "struct lguest_data"s hcalls[] ring once. */
for (i = 0; i < ARRAY_SIZE(st); i++) {
- struct lguest_regs regs;
+ struct hcall_args args;
/* We remember where we were up to from last time. This makes
* sure that the hypercalls are done in the order the Guest
* places them in the ring. */
@@ -152,18 +129,16 @@ static void do_async_hcalls(struct lguest *lg)
if (++lg->next_hcall == LHCALL_RING_SIZE)
lg->next_hcall = 0;
- /* We copy the hypercall arguments into a fake register
- * structure. This makes life simple for do_hcall(). */
- if (get_user(regs.eax, &lg->lguest_data->hcalls[n].eax)
- || get_user(regs.edx, &lg->lguest_data->hcalls[n].edx)
- || get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx)
- || get_user(regs.ebx, &lg->lguest_data->hcalls[n].ebx)) {
+ /* Copy the hypercall arguments into a local copy of
+ * the hcall_args struct. */
+ if (copy_from_user(&args, &lg->lguest_data->hcalls[n],
+ sizeof(struct hcall_args))) {
kill_guest(lg, "Fetching async hypercalls");
break;
}
/* Do the hypercall, same as a normal one. */
- do_hcall(lg, &regs);
+ do_hcall(lg, &args);
/* Mark the hypercall done. */
if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) {
@@ -171,9 +146,9 @@ static void do_async_hcalls(struct lguest *lg)
break;
}
- /* Stop doing hypercalls if we've just done a DMA to the
- * Launcher: it needs to service this first. */
- if (lg->dma_is_pending)
+ /* Stop doing hypercalls if they want to notify the Launcher:
+ * it needs to service this first. */
+ if (lg->pending_notify)
break;
}
}
@@ -182,76 +157,35 @@ static void do_async_hcalls(struct lguest *lg)
* Guest makes a hypercall, we end up here to set things up: */
static void initialize(struct lguest *lg)
{
- u32 tsc_speed;
/* You can't do anything until you're initialized. The Guest knows the
* rules, so we're unforgiving here. */
- if (lg->regs->eax != LHCALL_LGUEST_INIT) {
- kill_guest(lg, "hypercall %li before LGUEST_INIT",
- lg->regs->eax);
+ if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) {
+ kill_guest(lg, "hypercall %li before INIT", lg->hcall->arg0);
return;
}
- /* We insist that the Time Stamp Counter exist and doesn't change with
- * cpu frequency. Some devious chip manufacturers decided that TSC
- * changes could be handled in software. I decided that time going
- * backwards might be good for benchmarks, but it's bad for users.
- *
- * We also insist that the TSC be stable: the kernel detects unreliable
- * TSCs for its own purposes, and we use that here. */
- if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
- tsc_speed = tsc_khz;
- else
- tsc_speed = 0;
-
- /* The pointer to the Guest's "struct lguest_data" is the only
- * argument. */
- lg->lguest_data = (struct lguest_data __user *)lg->regs->edx;
- /* If we check the address they gave is OK now, we can simply
- * copy_to_user/from_user from now on rather than using lgread/lgwrite.
- * I put this in to show that I'm not immune to writing stupid
- * optimizations. */
- if (!lguest_address_ok(lg, lg->regs->edx, sizeof(*lg->lguest_data))) {
+ if (lguest_arch_init_hypercalls(lg))
kill_guest(lg, "bad guest page %p", lg->lguest_data);
- return;
- }
+
/* The Guest tells us where we're not to deliver interrupts by putting
* the range of addresses into "struct lguest_data". */
if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start)
- || get_user(lg->noirq_end, &lg->lguest_data->noirq_end)
- /* We tell the Guest that it can't use the top 4MB of virtual
- * addresses used by the Switcher. */
- || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
- || put_user(tsc_speed, &lg->lguest_data->tsc_khz)
- /* We also give the Guest a unique id, as used in lguest_net.c. */
- || put_user(lg->guestid, &lg->lguest_data->guestid))
+ || get_user(lg->noirq_end, &lg->lguest_data->noirq_end))
kill_guest(lg, "bad guest page %p", lg->lguest_data);
/* We write the current time into the Guest's data page once now. */
write_timestamp(lg);
+ /* page_tables.c will also do some setup. */
+ page_table_guest_data_init(lg);
+
/* This is the one case where the above accesses might have been the
* first write to a Guest page. This may have caused a copy-on-write
* fault, but the Guest might be referring to the old (read-only)
* page. */
guest_pagetable_clear_all(lg);
}
-/* Now we've examined the hypercall code; our Guest can make requests. There
- * is one other way we can do things for the Guest, as we see in
- * emulate_insn(). */
-
-/*H:110 Tricky point: we mark the hypercall as "done" once we've done it.
- * Normally we don't need to do this: the Guest will run again and update the
- * trap number before we come back around the run_guest() loop to
- * do_hypercalls().
- *
- * However, if we are signalled or the Guest sends DMA to the Launcher, that
- * loop will exit without running the Guest. When it comes back it would try
- * to re-run the hypercall. */
-static void clear_hcall(struct lguest *lg)
-{
- lg->regs->trapnum = 255;
-}
/*H:100
* Hypercalls
@@ -261,16 +195,12 @@ static void clear_hcall(struct lguest *lg)
*/
void do_hypercalls(struct lguest *lg)
{
- /* Not initialized yet? */
+ /* Not initialized yet? This hypercall must do it. */
if (unlikely(!lg->lguest_data)) {
- /* Did the Guest make a hypercall? We might have come back for
- * some other reason (an interrupt, a different trap). */
- if (lg->regs->trapnum == LGUEST_TRAP_ENTRY) {
- /* Set up the "struct lguest_data" */
- initialize(lg);
- /* The hypercall is done. */
- clear_hcall(lg);
- }
+ /* Set up the "struct lguest_data" */
+ initialize(lg);
+ /* Hcall is done. */
+ lg->hcall = NULL;
return;
}
@@ -280,12 +210,21 @@ void do_hypercalls(struct lguest *lg)
do_async_hcalls(lg);
/* If we stopped reading the hypercall ring because the Guest did a
- * SEND_DMA to the Launcher, we want to return now. Otherwise if the
- * Guest asked us to do a hypercall, we do it. */
- if (!lg->dma_is_pending && lg->regs->trapnum == LGUEST_TRAP_ENTRY) {
- do_hcall(lg, lg->regs);
- /* The hypercall is done. */
- clear_hcall(lg);
+ * NOTIFY to the Launcher, we want to return now. Otherwise we do
+ * the hypercall. */
+ if (!lg->pending_notify) {
+ do_hcall(lg, lg->hcall);
+ /* Tricky point: we reset the hcall pointer to mark the
+ * hypercall as "done". We use the hcall pointer rather than
+ * the trap number to indicate a hypercall is pending.
+ * Normally it doesn't matter: the Guest will run again and
+ * update the trap number before we come back here.
+ *
+ * However, if we are signalled or the Guest does a NOTIFY to the
+ * Launcher, the run_guest() loop will exit without running the
+ * Guest. When it comes back it would try to re-run the
+ * hypercall. */
+ lg->hcall = NULL;
}
}
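(Editor's aside: using the hcall pointer itself as the "pending" flag is a common trick: NULL means there is nothing to replay. A stripped-down illustration — the names here are ours, not the kernel's:)

#include <stdio.h>
#include <stddef.h>

struct hcall_args { unsigned long arg0; };

static struct hcall_args *pending;	/* NULL = no hypercall to (re)run */

static void run_pending(void)
{
	if (!pending)
		return;			/* nothing was interrupted */
	printf("running hypercall %lu\n", pending->arg0);
	pending = NULL;			/* done: a re-entry won't replay it */
}

int main(void)
{
	struct hcall_args a = { 42 };

	pending = &a;
	run_pending();	/* runs it once */
	run_pending();	/* safe no-op: the pointer was cleared */
	return 0;
}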
@@ -295,6 +234,6 @@ void write_timestamp(struct lguest *lg)
{
struct timespec now;
ktime_get_real_ts(&now);
- if (put_user(now, &lg->lguest_data->time))
+ if (copy_to_user(&lg->lguest_data->time, &now, sizeof(struct timespec)))
kill_guest(lg, "Writing timestamp");
}
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 39731232d82..82966982cb3 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -12,8 +12,14 @@
* them first, so we also have a way of "reflecting" them into the Guest as if
* they had been delivered to it directly. :*/
#include <linux/uaccess.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
#include "lg.h"
+/* Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */
+static unsigned int syscall_vector = SYSCALL_VECTOR;
+module_param(syscall_vector, uint, 0444);
+
/* The address of the interrupt handler is split into two bits: */
static unsigned long idt_address(u32 lo, u32 hi)
{
@@ -39,7 +45,7 @@ static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val)
{
/* Stack grows downwards: move stack then write value. */
*gstack -= 4;
- lgwrite_u32(lg, *gstack, val);
+ lgwrite(lg, *gstack, u32, val);
}
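(Editor's aside: a user-space model of push_guest_stack() — decrement the cursor by the word size first, then store at the new, lower address. A sketch only; the real routine writes into Guest memory via lgwrite():)

#include <stdio.h>
#include <string.h>
#include <stdint.h>

static void push(uint8_t *base, unsigned long *sp, uint32_t val)
{
	*sp -= 4;			/* move stack down first... */
	memcpy(base + *sp, &val, 4);	/* ...then write the value */
}

int main(void)
{
	uint8_t stack[64];
	unsigned long sp = sizeof(stack);	/* stacks start at the top */
	uint32_t top;

	push(stack, &sp, 0xdeadbeef);
	push(stack, &sp, 0xcafef00d);
	memcpy(&top, stack + sp, 4);
	printf("sp=%lu top=%#x\n", sp, top);	/* sp=56 top=0xcafef00d */
	return 0;
}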
/*H:210 The set_guest_interrupt() routine actually delivers the interrupt or
@@ -56,8 +62,9 @@ static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val)
* it). */
static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
{
- unsigned long gstack;
+ unsigned long gstack, origstack;
u32 eflags, ss, irq_enable;
+ unsigned long virtstack;
/* There are two cases for interrupts: one where the Guest is already
* in the kernel, and a more complex one where the Guest is in
@@ -65,8 +72,10 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
if ((lg->regs->ss&0x3) != GUEST_PL) {
/* The Guest told us their kernel stack with the SET_STACK
* hypercall: both the virtual address and the segment */
- gstack = guest_pa(lg, lg->esp1);
+ virtstack = lg->esp1;
ss = lg->ss1;
+
+ origstack = gstack = guest_pa(lg, virtstack);
/* We push the old stack segment and pointer onto the new
* stack: when the Guest does an "iret" back from the interrupt
* handler the CPU will notice they're dropping privilege
@@ -75,8 +84,10 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
push_guest_stack(lg, &gstack, lg->regs->esp);
} else {
/* We're staying on the same Guest (kernel) stack. */
- gstack = guest_pa(lg, lg->regs->esp);
+ virtstack = lg->regs->esp;
ss = lg->regs->ss;
+
+ origstack = gstack = guest_pa(lg, virtstack);
}
/* Remember that we never let the Guest actually disable interrupts, so
@@ -102,7 +113,7 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
/* Now we've pushed all the old state, we change the stack, the code
* segment and the address to execute. */
lg->regs->ss = ss;
- lg->regs->esp = gstack + lg->page_offset;
+ lg->regs->esp = virtstack + (gstack - origstack);
lg->regs->cs = (__KERNEL_CS|GUEST_PL);
lg->regs->eip = idt_address(lo, hi);
@@ -165,7 +176,7 @@ void maybe_do_interrupt(struct lguest *lg)
/* Look at the IDT entry the Guest gave us for this interrupt. The
* first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip
* over them. */
- idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq];
+ idt = &lg->arch.idt[FIRST_EXTERNAL_VECTOR+irq];
/* If they don't have a handler (yet?), we just ignore it */
if (idt_present(idt->a, idt->b)) {
/* OK, mark it no longer pending and deliver it. */
@@ -183,6 +194,47 @@ void maybe_do_interrupt(struct lguest *lg)
* timer interrupt. */
write_timestamp(lg);
}
+/*:*/
+
+/* Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent
+ * me a patch, so we support that too. It'd be a big step for lguest if half
+ * the Plan 9 user base were to start using it.
+ *
+ * Actually now I think of it, it's possible that Ron *is* half the Plan 9
+ * userbase. Oh well. */
+static bool could_be_syscall(unsigned int num)
+{
+ /* Normal Linux SYSCALL_VECTOR or reserved vector? */
+ return num == SYSCALL_VECTOR || num == syscall_vector;
+}
+
+/* The syscall vector it wants must be unused by the Host. */
+bool check_syscall_vector(struct lguest *lg)
+{
+ u32 vector;
+
+ if (get_user(vector, &lg->lguest_data->syscall_vec))
+ return false;
+
+ return could_be_syscall(vector);
+}
+
+int init_interrupts(void)
+{
+ /* If they want some strange system call vector, reserve it now */
+ if (syscall_vector != SYSCALL_VECTOR
+ && test_and_set_bit(syscall_vector, used_vectors)) {
+ printk("lg: couldn't reserve syscall %u\n", syscall_vector);
+ return -EBUSY;
+ }
+ return 0;
+}
+
+void free_interrupts(void)
+{
+ if (syscall_vector != SYSCALL_VECTOR)
+ clear_bit(syscall_vector, used_vectors);
+}
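(Editor's aside: the reservation above relies on test_and_set_bit() returning the previous bit value, so two users can never claim the same vector. A user-space stand-in with its own bitmap — unlike the kernel primitive, this sketch is not atomic:)

#include <stdio.h>

#define NR_VECTORS 256
#define BITS 32

static unsigned long vector_map[NR_VECTORS / BITS];

/* Non-atomic stand-in for test_and_set_bit(): returns the old bit. */
static int test_and_set(unsigned int nr)
{
	unsigned long mask = 1UL << (nr % BITS);
	unsigned long *w = &vector_map[nr / BITS];
	int old = !!(*w & mask);

	*w |= mask;
	return old;
}

int main(void)
{
	unsigned int vec = 64;	/* e.g. the Plan 9 syscall vector */

	if (test_and_set(vec))
		puts("vector already taken");
	else
		printf("reserved vector %u\n", vec);
	return 0;
}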
/*H:220 Now we've got the routines to deliver interrupts, delivering traps
* like page fault is easy. The only trick is that Intel decided that some
@@ -197,14 +249,14 @@ int deliver_trap(struct lguest *lg, unsigned int num)
{
/* Trap numbers are always 8 bit, but we set an impossible trap number
* for traps inside the Switcher, so check that here. */
- if (num >= ARRAY_SIZE(lg->idt))
+ if (num >= ARRAY_SIZE(lg->arch.idt))
return 0;
/* Early on the Guest hasn't set the IDT entries (or maybe it put a
* bogus one in): if we fail here, the Guest will be killed. */
- if (!idt_present(lg->idt[num].a, lg->idt[num].b))
+ if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b))
return 0;
- set_guest_interrupt(lg, lg->idt[num].a, lg->idt[num].b, has_err(num));
+ set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b,
+		      has_err(num));
return 1;
}
@@ -218,28 +270,20 @@ int deliver_trap(struct lguest *lg, unsigned int num)
* system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all
* the other hypervisors would tease it.
*
- * This routine determines if a trap can be delivered directly. */
-static int direct_trap(const struct lguest *lg,
- const struct desc_struct *trap,
- unsigned int num)
+ * This routine indicates if a particular trap number could be delivered
+ * directly. */
+static int direct_trap(unsigned int num)
{
/* Hardware interrupts don't go to the Guest at all (except system
* call). */
- if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR)
+ if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num))
return 0;
/* The Host needs to see page faults (for shadow paging and to save the
* fault address), general protection faults (in/out emulation) and
* device not available (TS handling), and of course, the hypercall
* trap. */
- if (num == 14 || num == 13 || num == 7 || num == LGUEST_TRAP_ENTRY)
- return 0;
-
- /* Only trap gates (type 15) can go direct to the Guest. Interrupt
- * gates (type 14) disable interrupts as they are entered, which we
- * never let the Guest do. Not present entries (type 0x0) also can't
- * go direct, of course 8) */
- return idt_type(trap->a, trap->b) == 0xF;
+ return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY;
}
/*:*/
@@ -348,15 +392,11 @@ void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi)
* to copy this again. */
lg->changed |= CHANGED_IDT;
- /* The IDT which we keep in "struct lguest" only contains 32 entries
- * for the traps and LGUEST_IRQS (32) entries for interrupts. We
- * ignore attempts to set handlers for higher interrupt numbers, except
- * for the system call "interrupt" at 128: we have a special IDT entry
- * for that. */
- if (num < ARRAY_SIZE(lg->idt))
- set_trap(lg, &lg->idt[num], num, lo, hi);
- else if (num == SYSCALL_VECTOR)
- set_trap(lg, &lg->syscall_idt, num, lo, hi);
+ /* Check that the Guest doesn't try to step outside the bounds. */
+ if (num >= ARRAY_SIZE(lg->arch.idt))
+ kill_guest(lg, "Setting idt entry %u", num);
+ else
+ set_trap(lg, &lg->arch.idt[num], num, lo, hi);
}
/* The default entry for each interrupt points into the Switcher routines which
@@ -399,20 +439,21 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt,
/* We can simply copy the direct traps, otherwise we use the default
* ones in the Switcher: they will return to the Host. */
- for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) {
- if (direct_trap(lg, &lg->idt[i], i))
- idt[i] = lg->idt[i];
+ for (i = 0; i < ARRAY_SIZE(lg->arch.idt); i++) {
+ /* If no Guest can ever override this trap, leave it alone. */
+ if (!direct_trap(i))
+ continue;
+
+ /* Only trap gates (type 15) can go direct to the Guest.
+ * Interrupt gates (type 14) disable interrupts as they are
+ * entered, which we never let the Guest do. Not present
+ * entries (type 0x0) also can't go direct, of course. */
+ if (idt_type(lg->arch.idt[i].a, lg->arch.idt[i].b) == 0xF)
+ idt[i] = lg->arch.idt[i];
else
+ /* Reset it to the default. */
default_idt_entry(&idt[i], i, def[i]);
}
-
- /* Don't forget the system call trap! The IDT entries for other
- * interrupts never change, so no need to copy them. */
- i = SYSCALL_VECTOR;
- if (direct_trap(lg, &lg->syscall_idt, i))
- idt[i] = lg->syscall_idt;
- else
- default_idt_entry(&idt[i], i, def[i]);
}
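(Editor's aside: the type check at the heart of copy_traps() reads the gate-type nibble out of the descriptor's high word — bits 8..11 in the i386 descriptor format, as lguest's idt_type() helper does. A small sketch of that extraction, with illustrative descriptor values:)

#include <stdio.h>
#include <stdint.h>

/* 0xF = 32-bit trap gate, 0xE = 32-bit interrupt gate, 0x0 = absent. */
static unsigned int idt_type(uint32_t lo, uint32_t hi)
{
	(void)lo;			/* type lives in the high word */
	return (hi >> 8) & 0xF;
}

int main(void)
{
	uint32_t hi_trap = 0x00008F00;	/* present, DPL 0, trap gate */
	uint32_t hi_intr = 0x00008E00;	/* present, DPL 0, interrupt gate */

	printf("%x %x\n", idt_type(0, hi_trap), idt_type(0, hi_intr));
	return 0;
}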
void guest_set_clockevent(struct lguest *lg, unsigned long delta)
diff --git a/drivers/lguest/io.c b/drivers/lguest/io.c
deleted file mode 100644
index ea68613b43f..00000000000
--- a/drivers/lguest/io.c
+++ /dev/null
@@ -1,626 +0,0 @@
-/*P:300 The I/O mechanism in lguest is simple yet flexible, allowing the Guest
- * to talk to the Launcher or directly to another Guest. It uses familiar
- * concepts of DMA and interrupts, plus some neat code stolen from
- * futexes... :*/
-
-/* Copyright (C) 2006 Rusty Russell IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include <linux/types.h>
-#include <linux/futex.h>
-#include <linux/jhash.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/uaccess.h>
-#include "lg.h"
-
-/*L:300
- * I/O
- *
- * Getting data in and out of the Guest is quite an art. There are numerous
- * ways to do it, and they all suck differently. We try to keep things fairly
- * close to "real" hardware so our Guest's drivers don't look like an alien
- * visitation in the middle of the Linux code, and yet make sure that Guests
- * can talk directly to other Guests, not just the Launcher.
- *
- * To do this, the Guest gives us a key when it binds or sends DMA buffers.
- * The key corresponds to a "physical" address inside the Guest (ie. a virtual
- * address inside the Launcher process). We don't, however, use this key
- * directly.
- *
- * We want Guests which share memory to be able to DMA to each other: two
- * Launchers can mmap the same file, and then the Guests can communicate.
- * Fortunately, the futex code provides us with a way to get a "union
- * futex_key" corresponding to the memory lying at a virtual address: if the
- * two processes share memory, the "union futex_key" for that memory will match
- * even if the memory is mapped at different addresses in each. So we always
- * convert the keys to "union futex_key"s to compare them.
- *
- * Before we dive into this though, we need to look at another set of helper
- * routines used throughout the Host kernel code to access Guest memory.
- :*/
-static struct list_head dma_hash[61];
-
-/* An unfortunate side effect of the Linux double-linked list implementation is
- * that there's no good way to statically initialize an array of linked
- * lists. */
-void lguest_io_init(void)
-{
- unsigned int i;
-
- for (i = 0; i < ARRAY_SIZE(dma_hash); i++)
- INIT_LIST_HEAD(&dma_hash[i]);
-}
-
-/* FIXME: allow multi-page lengths. */
-static int check_dma_list(struct lguest *lg, const struct lguest_dma *dma)
-{
- unsigned int i;
-
- for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
- if (!dma->len[i])
- return 1;
- if (!lguest_address_ok(lg, dma->addr[i], dma->len[i]))
- goto kill;
- if (dma->len[i] > PAGE_SIZE)
- goto kill;
- /* We could do over a page, but is it worth it? */
- if ((dma->addr[i] % PAGE_SIZE) + dma->len[i] > PAGE_SIZE)
- goto kill;
- }
- return 1;
-
-kill:
- kill_guest(lg, "bad DMA entry: %u@%#lx", dma->len[i], dma->addr[i]);
- return 0;
-}
-
-/*L:330 This is our hash function, using the wonderful Jenkins hash.
- *
- * The futex key is a union with three parts: an unsigned long word, a pointer,
- * and an int "offset". We could use jhash_2words() which takes three u32s.
- * (Ok, the hash functions are great: the naming sucks though).
- *
- * It's nice to be portable to 64-bit platforms, so we use the more generic
- * jhash2(), which takes an array of u32, the number of u32s, and an initial
- * u32 to roll in. This is uglier, but breaks down to almost the same code on
- * 32-bit platforms like this one.
- *
- * We want a position in the array, so we modulo ARRAY_SIZE(dma_hash) (ie. 61).
- */
-static unsigned int hash(const union futex_key *key)
-{
- return jhash2((u32*)&key->both.word,
- (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
- key->both.offset)
- % ARRAY_SIZE(dma_hash);
-}
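(Editor's aside: since jhash2() is kernel-internal, here is a hedged user-space sketch of the same bucket selection — mix the key words with a seed, then take the result modulo the prime table size. The mixer below is a simple multiplicative stand-in, not the real Jenkins hash:)

#include <stdio.h>
#include <stdint.h>

#define HASH_BUCKETS 61	/* prime, like ARRAY_SIZE(dma_hash) above */

static unsigned int mix(const uint32_t *k, unsigned int n, uint32_t seed)
{
	uint32_t h = seed;
	unsigned int i;

	for (i = 0; i < n; i++)
		h = (h ^ k[i]) * 2654435761u;	/* Knuth multiplicative step */
	return h;
}

int main(void)
{
	uint32_t key[3] = { 0x1000, 0xdead, 7 };	/* word, ptr, offset */

	/* Mix the first two words, seed with the offset, pick a bucket. */
	printf("bucket %u\n", mix(key, 2, key[2]) % HASH_BUCKETS);
	return 0;
}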
-
-/* This is a convenience routine to compare two keys. It's a much bemoaned C
- * weakness that it doesn't allow '==' on structures or unions, so we have to
- * open-code it like this. */
-static inline int key_eq(const union futex_key *a, const union futex_key *b)
-{
- return (a->both.word == b->both.word
- && a->both.ptr == b->both.ptr
- && a->both.offset == b->both.offset);
-}
-
-/*L:360 OK, when we need to actually free up a Guest's DMA array we do several
- * things, so we have a convenient function to do it.
- *
- * The caller must hold a read lock on dmainfo owner's current->mm->mmap_sem
- * for the drop_futex_key_refs(). */
-static void unlink_dma(struct lguest_dma_info *dmainfo)
-{
- /* You locked this too, right? */
- BUG_ON(!mutex_is_locked(&lguest_lock));
- /* This is how we know that the entry is free. */
- dmainfo->interrupt = 0;
- /* Remove it from the hash table. */
- list_del(&dmainfo->list);
- /* Drop the references we were holding (to the inode or mm). */
- drop_futex_key_refs(&dmainfo->key);
-}
-
-/*L:350 This is the routine which we call when the Guest asks to unregister a
- * DMA array attached to a given key. Returns true if the array was found. */
-static int unbind_dma(struct lguest *lg,
- const union futex_key *key,
- unsigned long dmas)
-{
- int i, ret = 0;
-
- /* We don't bother with the hash table, just look through all this
- * Guest's DMA arrays. */
- for (i = 0; i < LGUEST_MAX_DMA; i++) {
- /* In theory it could have more than one array on the same key,
- * or one array on multiple keys, so we check both */
- if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) {
- unlink_dma(&lg->dma[i]);
- ret = 1;
- break;
- }
- }
- return ret;
-}
-
-/*L:340 BIND_DMA: this is the hypercall which sets up an array of "struct
- * lguest_dma" for receiving I/O.
- *
- * The Guest wants to bind an array of "struct lguest_dma"s to a particular key
- * to receive input. This only happens when the Guest is setting up a new
- * device, so it doesn't have to be very fast.
- *
- * It returns 1 on a successful registration (it can fail if we hit the limit
- * of registrations for this Guest).
- */
-int bind_dma(struct lguest *lg,
- unsigned long ukey, unsigned long dmas, u16 numdmas, u8 interrupt)
-{
- unsigned int i;
- int ret = 0;
- union futex_key key;
- /* Futex code needs the mmap_sem. */
- struct rw_semaphore *fshared = &current->mm->mmap_sem;
-
- /* Invalid interrupt? (We could kill the guest here). */
- if (interrupt >= LGUEST_IRQS)
- return 0;
-
- /* We need to grab the Big Lguest Lock, because other Guests may be
- * trying to look through this Guest's DMAs to send something while
- * we're doing this. */
- mutex_lock(&lguest_lock);
- down_read(fshared);
- if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) {
- kill_guest(lg, "bad dma key %#lx", ukey);
- goto unlock;
- }
-
- /* We want to keep this key valid once we drop mmap_sem, so we have to
- * hold a reference. */
- get_futex_key_refs(&key);
-
- /* If the Guest specified an interrupt of 0, that means they want to
- * unregister this array of "struct lguest_dma"s. */
- if (interrupt == 0)
- ret = unbind_dma(lg, &key, dmas);
- else {
- /* Look through this Guest's dma array for an unused entry. */
- for (i = 0; i < LGUEST_MAX_DMA; i++) {
- /* If the interrupt is non-zero, the entry is already
- * used. */
- if (lg->dma[i].interrupt)
- continue;
-
- /* OK, a free one! Fill in our details. */
- lg->dma[i].dmas = dmas;
- lg->dma[i].num_dmas = numdmas;
- lg->dma[i].next_dma = 0;
- lg->dma[i].key = key;
- lg->dma[i].guestid = lg->guestid;
- lg->dma[i].interrupt = interrupt;
-
- /* Now we add it to the hash table: the position
- * depends on the futex key that we got. */
- list_add(&lg->dma[i].list, &dma_hash[hash(&key)]);
- /* Success! */
- ret = 1;
- goto unlock;
- }
- }
- /* If we didn't find a slot to put the key in, drop the reference
- * again. */
- drop_futex_key_refs(&key);
-unlock:
- /* Unlock and out. */
- up_read(fshared);
- mutex_unlock(&lguest_lock);
- return ret;
-}
-
-/*L:385 Note that our routines to access a different Guest's memory are called
- * lgread_other() and lgwrite_other(): these names emphasize that they are only
- * used when the Guest is *not* the current Guest.
- *
- * The interface for copying from another process's memory is called
- * access_process_vm(), with a final argument of 0 for a read, and 1 for a
- * write.
- *
- * We need lgread_other() to read the destination Guest's "struct lguest_dma"
- * array. */
-static int lgread_other(struct lguest *lg,
- void *buf, u32 addr, unsigned bytes)
-{
- if (!lguest_address_ok(lg, addr, bytes)
- || access_process_vm(lg->tsk, addr, buf, bytes, 0) != bytes) {
- memset(buf, 0, bytes);
- kill_guest(lg, "bad address in registered DMA struct");
- return 0;
- }
- return 1;
-}
-
-/* "lgwrite()" to another Guest: used to update the destination "used_len" once
- * we've transferred data into the buffer. */
-static int lgwrite_other(struct lguest *lg, u32 addr,
- const void *buf, unsigned bytes)
-{
- if (!lguest_address_ok(lg, addr, bytes)
- || (access_process_vm(lg->tsk, addr, (void *)buf, bytes, 1)
- != bytes)) {
- kill_guest(lg, "bad address writing to registered DMA");
- return 0;
- }
- return 1;
-}
-
-/*L:400 This is the generic engine which copies from a source "struct
- * lguest_dma" from this Guest into another Guest's "struct lguest_dma". The
- * destination Guest's pages have already been mapped, as contained in the
- * pages array.
- *
- * If you're wondering if there's a nice "copy from one process to another"
- * routine, so was I. But Linux isn't really set up to copy between two
- * unrelated processes, so we have to write it ourselves.
- */
-static u32 copy_data(struct lguest *srclg,
- const struct lguest_dma *src,
- const struct lguest_dma *dst,
- struct page *pages[])
-{
- unsigned int totlen, si, di, srcoff, dstoff;
- void *maddr = NULL;
-
- /* We return the total length transferred. */
- totlen = 0;
-
- /* We keep indexes into the source and destination "struct lguest_dma",
- * and an offset within each region. */
- si = di = 0;
- srcoff = dstoff = 0;
-
- /* We loop until the source or destination is exhausted. */
- while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si]
- && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) {
- /* We can only transfer the rest of the src buffer, or as much
- * as will fit into the destination buffer. */
- u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff);
-
- /* For systems using "highmem" we need to use kmap() to access
- * the page we want. We often use the same page over and over,
- * so rather than kmap() it on every loop, we set the maddr
- * pointer to NULL when we need to move to the next
- * destination page. */
- if (!maddr)
- maddr = kmap(pages[di]);
-
- /* Copy directly from (this Guest's) source address to the
- * destination Guest's kmap()ed buffer. Note that maddr points
- * to the start of the page: we need to add the offset of the
- * destination address and offset within the buffer. */
-
- /* FIXME: This is not completely portable. I looked at
- * copy_to_user_page(), and some arch's seem to need special
- * flushes. x86 is fine. */
- if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE,
- (void __user *)src->addr[si], len) != 0) {
- /* If a copy failed, it's the source's fault. */
- kill_guest(srclg, "bad address in sending DMA");
- totlen = 0;
- break;
- }
-
- /* Increment the total and src & dst offsets */
- totlen += len;
- srcoff += len;
- dstoff += len;
-
- /* Presumably we reached the end of the src or dest buffers: */
- if (srcoff == src->len[si]) {
- /* Move to the next buffer at offset 0 */
- si++;
- srcoff = 0;
- }
- if (dstoff == dst->len[di]) {
- /* We need to unmap that destination page and reset
- * maddr ready for the next one. */
- kunmap(pages[di]);
- maddr = NULL;
- di++;
- dstoff = 0;
- }
- }
-
- /* If we still had a page mapped at the end, unmap now. */
- if (maddr)
- kunmap(pages[di]);
-
- return totlen;
-}
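(Editor's aside: the core of copy_data() is a two-cursor walk over two scatter-gather lists, copying min(remaining-in-src, remaining-in-dst) each step. A self-contained model of just that loop, without the kmap()/copy_from_user() machinery:)

#include <stdio.h>
#include <string.h>

#define SECTIONS 3

struct dma {
	char *addr[SECTIONS];
	unsigned int len[SECTIONS];	/* 0 terminates the list */
};

static unsigned int sg_copy(const struct dma *src, const struct dma *dst)
{
	unsigned int si = 0, di = 0, soff = 0, doff = 0, total = 0;

	while (si < SECTIONS && src->len[si] &&
	       di < SECTIONS && dst->len[di]) {
		/* Copy whatever is left in the smaller of the two. */
		unsigned int len = src->len[si] - soff;

		if (dst->len[di] - doff < len)
			len = dst->len[di] - doff;
		memcpy(dst->addr[di] + doff, src->addr[si] + soff, len);
		total += len;
		soff += len;
		doff += len;
		if (soff == src->len[si]) { si++; soff = 0; }
		if (doff == dst->len[di]) { di++; doff = 0; }
	}
	return total;
}

int main(void)
{
	char a[] = "hello ", b[] = "world", out1[8] = "", out2[8] = "";
	struct dma src = { { a, b, 0 }, { 6, 5, 0 } };
	struct dma dst = { { out1, out2, 0 }, { 7, 7, 0 } };

	printf("%u bytes: %s%s\n", sg_copy(&src, &dst), out1, out2);
	return 0;
}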
-
-/*L:390 This is how we transfer a "struct lguest_dma" from the source Guest
- * (the current Guest which called SEND_DMA) to another Guest. */
-static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src,
- struct lguest *dstlg, const struct lguest_dma *dst)
-{
- int i;
- u32 ret;
- struct page *pages[LGUEST_MAX_DMA_SECTIONS];
-
- /* We check that both source and destination "struct lguest_dma"s are
- * within the bounds of the source and destination Guests */
- if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src))
- return 0;
-
- /* We need to map the pages which correspond to each part of the
- * destination buffer. */
- for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
- if (dst->len[i] == 0)
- break;
- /* get_user_pages() is a complicated function, especially since
- * we only want a single page. But it works, and returns the
- * number of pages. Note that we're holding the destination's
- * mmap_sem, as get_user_pages() requires. */
- if (get_user_pages(dstlg->tsk, dstlg->mm,
- dst->addr[i], 1, 1, 1, pages+i, NULL)
- != 1) {
- /* This means the destination gave us a bogus buffer */
- kill_guest(dstlg, "Error mapping DMA pages");
- ret = 0;
- goto drop_pages;
- }
- }
-
- /* Now copy the data until we run out of src or dst. */
- ret = copy_data(srclg, src, dst, pages);
-
-drop_pages:
- while (--i >= 0)
- put_page(pages[i]);
- return ret;
-}
-
-/*L:380 Transferring data from one Guest to another is not as simple as I'd
- * like. We've found the "struct lguest_dma_info" bound to the same address as
- * the send; now we need to copy into it.
- *
- * This function returns true if the destination array was empty. */
-static int dma_transfer(struct lguest *srclg,
- unsigned long udma,
- struct lguest_dma_info *dst)
-{
- struct lguest_dma dst_dma, src_dma;
- struct lguest *dstlg;
- u32 i, dma = 0;
-
- /* From the "struct lguest_dma_info" we found in the hash, grab the
- * Guest. */
- dstlg = &lguests[dst->guestid];
- /* Read in the source "struct lguest_dma" handed to SEND_DMA. */
- lgread(srclg, &src_dma, udma, sizeof(src_dma));
-
- /* We need the destination's mmap_sem, and we already hold the source's
- * mmap_sem for the futex key lookup. Normally this would suggest that
- * we could deadlock if the destination Guest was trying to send to
- * this source Guest at the same time, which is another reason that all
- * I/O is done under the big lguest_lock. */
- down_read(&dstlg->mm->mmap_sem);
-
- /* Look through the destination DMA array for an available buffer. */
- for (i = 0; i < dst->num_dmas; i++) {
- /* We keep a "next_dma" pointer which often helps us avoid
- * looking at lots of previously-filled entries. */
- dma = (dst->next_dma + i) % dst->num_dmas;
- if (!lgread_other(dstlg, &dst_dma,
- dst->dmas + dma * sizeof(struct lguest_dma),
- sizeof(dst_dma))) {
- goto fail;
- }
- if (!dst_dma.used_len)
- break;
- }
-
- /* If we found a buffer, we do the actual data copy. */
- if (i != dst->num_dmas) {
- unsigned long used_lenp;
- unsigned int ret;
-
- ret = do_dma(srclg, &src_dma, dstlg, &dst_dma);
- /* Put used length in the source "struct lguest_dma"'s used_len
- * field. It's a little tricky to figure out where that is,
- * though. */
- lgwrite_u32(srclg,
- udma+offsetof(struct lguest_dma, used_len), ret);
- /* Transferring 0 bytes is OK if the source buffer was empty. */
- if (ret == 0 && src_dma.len[0] != 0)
- goto fail;
-
- /* The destination Guest might be running on a different CPU:
- * we have to make sure that it will see the "used_len" field
- * change to non-zero *after* it sees the data we copied into
- * the buffer. Hence a write memory barrier. */
- wmb();
- /* Figuring out where the destination's used_len field for this
- * "struct lguest_dma" in the array is also a little ugly. */
- used_lenp = dst->dmas
- + dma * sizeof(struct lguest_dma)
- + offsetof(struct lguest_dma, used_len);
- lgwrite_other(dstlg, used_lenp, &ret, sizeof(ret));
- /* Move the cursor for next time. */
- dst->next_dma++;
- }
- up_read(&dstlg->mm->mmap_sem);
-
- /* We trigger the destination interrupt, even if the destination was
- * empty and we didn't transfer anything: this gives them a chance to
- * wake up and refill. */
- set_bit(dst->interrupt, dstlg->irqs_pending);
- /* Wake up the destination process. */
- wake_up_process(dstlg->tsk);
- /* If we passed the last "struct lguest_dma", the receiver had no
- * buffers left. */
- return i == dst->num_dmas;
-
-fail:
- up_read(&dstlg->mm->mmap_sem);
- return 0;
-}
-
-/*L:370 This is the counter-side to the BIND_DMA hypercall; the SEND_DMA
- * hypercall. We find out who's listening, and send to them. */
-void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma)
-{
- union futex_key key;
- int empty = 0;
- struct rw_semaphore *fshared = &current->mm->mmap_sem;
-
-again:
- mutex_lock(&lguest_lock);
- down_read(fshared);
- /* Get the futex key for the key the Guest gave us */
- if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) {
- kill_guest(lg, "bad sending DMA key");
- goto unlock;
- }
- /* Since the key must be a multiple of 4, the futex key uses the lower
- * bit of the "offset" field (which would always be 0) to indicate a
- * mapping which is shared with other processes (ie. Guests). */
- if (key.shared.offset & 1) {
- struct lguest_dma_info *i;
- /* Look through the hash for other Guests. */
- list_for_each_entry(i, &dma_hash[hash(&key)], list) {
- /* Don't send to ourselves. */
- if (i->guestid == lg->guestid)
- continue;
- if (!key_eq(&key, &i->key))
- continue;
-
- /* If dma_transfer() tells us the destination has no
- * available buffers, we increment "empty". */
- empty += dma_transfer(lg, udma, i);
- break;
- }
- /* If the destination is empty, we release our locks and
- * give the destination Guest a brief chance to restock. */
- if (empty == 1) {
- /* Give any recipients one chance to restock. */
- up_read(&current->mm->mmap_sem);
- mutex_unlock(&lguest_lock);
- /* Next time, we won't try again. */
- empty++;
- goto again;
- }
- } else {
- /* Private mapping: Guest is sending to its Launcher. We set
- * the "dma_is_pending" flag so that the main loop will exit
- * and the Launcher's read() from /dev/lguest will return. */
- lg->dma_is_pending = 1;
- lg->pending_dma = udma;
- lg->pending_key = ukey;
- }
-unlock:
- up_read(fshared);
- mutex_unlock(&lguest_lock);
-}
-/*:*/
-
-void release_all_dma(struct lguest *lg)
-{
- unsigned int i;
-
- BUG_ON(!mutex_is_locked(&lguest_lock));
-
- down_read(&lg->mm->mmap_sem);
- for (i = 0; i < LGUEST_MAX_DMA; i++) {
- if (lg->dma[i].interrupt)
- unlink_dma(&lg->dma[i]);
- }
- up_read(&lg->mm->mmap_sem);
-}
-
-/*M:007 We only return a single DMA buffer to the Launcher, but it would be
- * more efficient to return a pointer to the entire array of DMA buffers, which
- * it can cache and choose one whenever it wants.
- *
- * Currently the Launcher uses a write to /dev/lguest, and the return value is
- * the address of the DMA structure with the interrupt number placed in
- * dma->used_len. If we wanted to return the entire array, we would need to
- * the address, array size and interrupt number: this seems to require an
- * ioctl(). :*/
-
-/*L:320 This routine looks for a DMA buffer registered by the Guest on the
- * given key (using the BIND_DMA hypercall). */
-unsigned long get_dma_buffer(struct lguest *lg,
- unsigned long ukey, unsigned long *interrupt)
-{
- unsigned long ret = 0;
- union futex_key key;
- struct lguest_dma_info *i;
- struct rw_semaphore *fshared = &current->mm->mmap_sem;
-
- /* Take the Big Lguest Lock to stop other Guests sending this Guest DMA
- * at the same time. */
- mutex_lock(&lguest_lock);
- /* To match between Guests sharing the same underlying memory we steal
- * code from the futex infrastructure. This requires that we hold the
- * "mmap_sem" for our process (the Launcher), and pass it to the futex
- * code. */
- down_read(fshared);
-
- /* This can fail if it's not a valid address, or if the address is not
- * divisible by 4 (the futex code needs that, we don't really). */
- if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) {
- kill_guest(lg, "bad registered DMA buffer");
- goto unlock;
- }
- /* Search the hash table for matching entries (the Launcher can only
- * send to its own Guest for the moment, so the entry must be for this
- * Guest) */
- list_for_each_entry(i, &dma_hash[hash(&key)], list) {
- if (key_eq(&key, &i->key) && i->guestid == lg->guestid) {
- unsigned int j;
- /* Look through the registered DMA array for an
- * available buffer. */
- for (j = 0; j < i->num_dmas; j++) {
- struct lguest_dma dma;
-
- ret = i->dmas + j * sizeof(struct lguest_dma);
- lgread(lg, &dma, ret, sizeof(dma));
- if (dma.used_len == 0)
- break;
- }
- /* Store the interrupt the Guest wants when the buffer
- * is used. */
- *interrupt = i->interrupt;
- break;
- }
- }
-unlock:
- up_read(fshared);
- mutex_unlock(&lguest_lock);
- return ret;
-}
-/*:*/
-
-/*L:410 That really does complete the Launcher. Not only have we now finished
- * the longest chapter in our journey, but it also means we are over halfway
- * through!
- *
- * Enough prevaricating around the bush: it is time for us to dive into the
- * core of the Host, in "make Host".
- */
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 64f0abed317..d9144beca82 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -1,119 +1,25 @@
#ifndef _LGUEST_H
#define _LGUEST_H
-#include <asm/desc.h>
-
-#define GDT_ENTRY_LGUEST_CS 10
-#define GDT_ENTRY_LGUEST_DS 11
-#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8)
-#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8)
-
#ifndef __ASSEMBLY__
#include <linux/types.h>
#include <linux/init.h>
#include <linux/stringify.h>
-#include <linux/binfmts.h>
-#include <linux/futex.h>
#include <linux/lguest.h>
#include <linux/lguest_launcher.h>
#include <linux/wait.h>
#include <linux/err.h>
#include <asm/semaphore.h>
-#include "irq_vectors.h"
-
-#define GUEST_PL 1
-struct lguest_regs
-{
- /* Manually saved part. */
- unsigned long ebx, ecx, edx;
- unsigned long esi, edi, ebp;
- unsigned long gs;
- unsigned long eax;
- unsigned long fs, ds, es;
- unsigned long trapnum, errcode;
- /* Trap pushed part */
- unsigned long eip;
- unsigned long cs;
- unsigned long eflags;
- unsigned long esp;
- unsigned long ss;
-};
+#include <asm/lguest.h>
void free_pagetables(void);
int init_pagetables(struct page **switcher_page, unsigned int pages);
-/* Full 4G segment descriptors, suitable for CS and DS. */
-#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00})
-#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300})
-
-struct lguest_dma_info
-{
- struct list_head list;
- union futex_key key;
- unsigned long dmas;
- u16 next_dma;
- u16 num_dmas;
- u16 guestid;
- u8 interrupt; /* 0 when not registered */
-};
-
-/*H:310 The page-table code owes a great debt of gratitude to Andi Kleen. He
- * reviewed the original code which used "u32" for all page table entries, and
- * insisted that it would be far clearer with explicit typing. I thought it
- * was overkill, but he was right: it is much clearer than it was before.
- *
- * We have separate types for the Guest's ptes & pgds and the shadow ptes &
- * pgds. There's already a Linux type for these (pte_t and pgd_t) but they
- * change depending on kernel config options (PAE). */
-
-/* Each entry is identical: lower 12 bits of flags and upper 20 bits for the
- * "page frame number" (0 == first physical page, etc). They are different
- * types so the compiler will warn us if we mix them improperly. */
-typedef union {
- struct { unsigned flags:12, pfn:20; };
- struct { unsigned long val; } raw;
-} spgd_t;
-typedef union {
- struct { unsigned flags:12, pfn:20; };
- struct { unsigned long val; } raw;
-} spte_t;
-typedef union {
- struct { unsigned flags:12, pfn:20; };
- struct { unsigned long val; } raw;
-} gpgd_t;
-typedef union {
- struct { unsigned flags:12, pfn:20; };
- struct { unsigned long val; } raw;
-} gpte_t;
-
-/* We have two convenient macros to convert a "raw" value as handed to us by
- * the Guest into the correct Guest PGD or PTE type. */
-#define mkgpte(_val) ((gpte_t){.raw.val = _val})
-#define mkgpgd(_val) ((gpgd_t){.raw.val = _val})
-/*:*/
-
struct pgdir
{
- unsigned long cr3;
- spgd_t *pgdir;
-};
-
-/* This is a guest-specific page (mapped ro) into the guest. */
-struct lguest_ro_state
-{
- /* Host information we need to restore when we switch back. */
- u32 host_cr3;
- struct Xgt_desc_struct host_idt_desc;
- struct Xgt_desc_struct host_gdt_desc;
- u32 host_sp;
-
- /* Fields which are used when guest is running. */
- struct Xgt_desc_struct guest_idt_desc;
- struct Xgt_desc_struct guest_gdt_desc;
- struct i386_hw_tss guest_tss;
- struct desc_struct guest_idt[IDT_ENTRIES];
- struct desc_struct guest_gdt[GDT_ENTRIES];
+ unsigned long gpgdir;
+ pgd_t *pgdir;
};
/* We have two pages shared with guests, per cpu. */
@@ -141,9 +47,11 @@ struct lguest
struct lguest_data __user *lguest_data;
struct task_struct *tsk;
struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */
- u16 guestid;
u32 pfn_limit;
- u32 page_offset;
+ /* This provides the offset to the base of guest-physical
+ * memory in the Launcher. */
+ void __user *mem_base;
+ unsigned long kernel_address;
u32 cr2;
int halted;
int ts;
@@ -151,6 +59,9 @@ struct lguest
u32 esp1;
u8 ss1;
+ /* If a hypercall was asked for, this points to the arguments. */
+ struct hcall_args *hcall;
+
/* Do we need to stop what we're doing and return to userspace? */
int break_out;
wait_queue_head_t break_wq;
@@ -167,24 +78,15 @@ struct lguest
struct task_struct *wake;
unsigned long noirq_start, noirq_end;
- int dma_is_pending;
- unsigned long pending_dma; /* struct lguest_dma */
- unsigned long pending_key; /* address they're sending to */
+ unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
unsigned int stack_pages;
u32 tsc_khz;
- struct lguest_dma_info dma[LGUEST_MAX_DMA];
-
/* Dead? */
const char *dead;
- /* The GDT entries copied into lguest_ro_state when running. */
- struct desc_struct gdt[GDT_ENTRIES];
-
- /* The IDT entries: some copied into lguest_ro_state when running. */
- struct desc_struct idt[FIRST_EXTERNAL_VECTOR+LGUEST_IRQS];
- struct desc_struct syscall_idt;
+ struct lguest_arch arch;
/* Virtual clock device */
struct hrtimer hrt;
@@ -193,19 +95,38 @@ struct lguest
DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
};
-extern struct lguest lguests[];
extern struct mutex lguest_lock;
/* core.c: */
-u32 lgread_u32(struct lguest *lg, unsigned long addr);
-void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val);
-void lgread(struct lguest *lg, void *buf, unsigned long addr, unsigned len);
-void lgwrite(struct lguest *lg, unsigned long, const void *buf, unsigned len);
-int find_free_guest(void);
int lguest_address_ok(const struct lguest *lg,
unsigned long addr, unsigned long len);
+void __lgread(struct lguest *, void *, unsigned long, unsigned);
+void __lgwrite(struct lguest *, unsigned long, const void *, unsigned);
+
+/*L:306 Using memory-copy operations like that is usually inconvenient, so we
+ * have the following helper macros which read and write a specific type (often
+ * an unsigned long).
+ *
+ * This reads into a variable of the given type then returns that. */
+#define lgread(lg, addr, type) \
+ ({ type _v; __lgread((lg), &_v, (addr), sizeof(_v)); _v; })
+
+/* This checks that the variable is of the given type, then writes it out. */
+#define lgwrite(lg, addr, type, val) \
+ do { \
+ typecheck(type, val); \
+ __lgwrite((lg), (addr), &(val), sizeof(val)); \
+ } while(0)
+/* (end of memory access helper routines) :*/
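(Editor's aside: the statement-expression trick in lgread() is worth seeing outside the kernel. A user-space sketch with memcpy() standing in for __lgread()/__lgwrite(); the typed temporary in the write macro plays the role the kernel's typecheck() plays above. Statement expressions are a GCC extension, as in the kernel itself:)

#include <stdio.h>
#include <string.h>

static unsigned char guest_mem[4096];	/* stand-in for Guest memory */

static void raw_read(void *buf, unsigned long addr, unsigned len)
{
	memcpy(buf, guest_mem + addr, len);
}

static void raw_write(unsigned long addr, const void *buf, unsigned len)
{
	memcpy(guest_mem + addr, buf, len);
}

/* Declare a temporary of the requested type, fill it, return it by value. */
#define mem_read(addr, type) \
	({ type _v; raw_read(&_v, (addr), sizeof(_v)); _v; })

#define mem_write(addr, type, val) \
	do { type _w = (val); raw_write((addr), &_w, sizeof(_w)); } while (0)

int main(void)
{
	mem_write(16, unsigned long, 0x1234UL);
	printf("%#lx\n", mem_read(16, unsigned long));
	return 0;
}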
+
int run_guest(struct lguest *lg, unsigned long __user *user);
+/* Helper macros to obtain the flags (low 12 bits) or the page frame number
+ * (top 20 bits); this is only the first step in the migration to the kernel
+ * types. pte_pfn is already defined in the kernel. */
+#define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK)
+#define pte_flags(x) (pte_val(x) & ~PAGE_MASK)
+#define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT)
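(Editor's aside: a worked example of the split these macros perform, assuming 4K pages, i.e. PAGE_SHIFT 12:)

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long entry = 0x12345067;	/* pfn 0x12345, flags 0x067 */

	printf("flags %#lx pfn %#lx\n",
	       entry & ~PAGE_MASK, entry >> PAGE_SHIFT);
	return 0;
}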
/* interrupts_and_traps.c: */
void maybe_do_interrupt(struct lguest *lg);
@@ -219,6 +140,9 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt,
const unsigned long *def);
void guest_set_clockevent(struct lguest *lg, unsigned long delta);
void init_clockdev(struct lguest *lg);
+bool check_syscall_vector(struct lguest *lg);
+int init_interrupts(void);
+void free_interrupts(void);
/* segments.c: */
void setup_default_gdt_entries(struct lguest_ro_state *state);
@@ -232,28 +156,33 @@ void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt);
int init_guest_pagetable(struct lguest *lg, unsigned long pgtable);
void free_guest_pagetable(struct lguest *lg);
void guest_new_pagetable(struct lguest *lg, unsigned long pgtable);
-void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 i);
+void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
void guest_pagetable_clear_all(struct lguest *lg);
void guest_pagetable_flush_user(struct lguest *lg);
-void guest_set_pte(struct lguest *lg, unsigned long cr3,
- unsigned long vaddr, gpte_t val);
+void guest_set_pte(struct lguest *lg, unsigned long gpgdir,
+ unsigned long vaddr, pte_t val);
void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages);
int demand_page(struct lguest *info, unsigned long cr2, int errcode);
void pin_page(struct lguest *lg, unsigned long vaddr);
+unsigned long guest_pa(struct lguest *lg, unsigned long vaddr);
+void page_table_guest_data_init(struct lguest *lg);
+
+/* <arch>/core.c: */
+void lguest_arch_host_init(void);
+void lguest_arch_host_fini(void);
+void lguest_arch_run_guest(struct lguest *lg);
+void lguest_arch_handle_trap(struct lguest *lg);
+int lguest_arch_init_hypercalls(struct lguest *lg);
+int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args);
+void lguest_arch_setup_regs(struct lguest *lg, unsigned long start);
+
+/* <arch>/switcher.S: */
+extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
/* lguest_user.c: */
int lguest_device_init(void);
void lguest_device_remove(void);
-/* io.c: */
-void lguest_io_init(void);
-int bind_dma(struct lguest *lg,
- unsigned long key, unsigned long udma, u16 numdmas, u8 interrupt);
-void send_dma(struct lguest *info, unsigned long key, unsigned long udma);
-void release_all_dma(struct lguest *lg);
-unsigned long get_dma_buffer(struct lguest *lg, unsigned long key,
- unsigned long *interrupt);
-
/* hypercalls.c: */
void do_hypercalls(struct lguest *lg);
void write_timestamp(struct lguest *lg);
@@ -292,9 +221,5 @@ do { \
} while(0)
/* (End of aside) :*/
-static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
-{
- return vaddr - lg->page_offset;
-}
#endif /* __ASSEMBLY__ */
#endif /* _LGUEST_H */
diff --git a/drivers/lguest/lguest.c b/drivers/lguest/lguest.c
deleted file mode 100644
index 3ba337dde85..00000000000
--- a/drivers/lguest/lguest.c
+++ /dev/null
@@ -1,1108 +0,0 @@
-/*P:010
- * A hypervisor allows multiple Operating Systems to run on a single machine.
- * To quote David Wheeler: "Any problem in computer science can be solved with
- * another layer of indirection."
- *
- * We keep things simple in two ways. First, we start with a normal Linux
- * kernel and insert a module (lg.ko) which allows us to run other Linux
- * kernels the same way we'd run processes. We call the first kernel the Host,
- * and the others the Guests. The program which sets up and configures Guests
- * (such as the example in Documentation/lguest/lguest.c) is called the
- * Launcher.
- *
- * Secondly, we only run specially modified Guests, not normal kernels. When
- * you set CONFIG_LGUEST to 'y' or 'm', this automatically sets
- * CONFIG_LGUEST_GUEST=y, which compiles this file into the kernel so it knows
- * how to be a Guest. This means that you can use the same kernel you boot
- * normally (ie. as a Host) as a Guest.
- *
- * These Guests know that they cannot do privileged operations, such as disable
- * interrupts, and that they have to ask the Host to do such things explicitly.
- * This file consists of all the replacements for such low-level native
- * hardware operations: these special Guest versions call the Host.
- *
- * So how does the kernel know it's a Guest? The Guest starts at a special
- * entry point marked with a magic string, which sets up a few things then
- * calls here. We replace the native functions in the various "paravirt"
- * structures with our Guest versions, then boot like normal. :*/
-
-/*
- * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-#include <linux/kernel.h>
-#include <linux/start_kernel.h>
-#include <linux/string.h>
-#include <linux/console.h>
-#include <linux/screen_info.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-#include <linux/lguest.h>
-#include <linux/lguest_launcher.h>
-#include <linux/lguest_bus.h>
-#include <asm/paravirt.h>
-#include <asm/param.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/setup.h>
-#include <asm/e820.h>
-#include <asm/mce.h>
-#include <asm/io.h>
-
-/*G:010 Welcome to the Guest!
- *
- * The Guest in our tale is a simple creature: identical to the Host but
- * behaving in simplified but equivalent ways. In particular, the Guest is the
- * same kernel as the Host (or at least, built from the same source code). :*/
-
-/* Declarations for definitions in lguest_guest.S */
-extern char lguest_noirq_start[], lguest_noirq_end[];
-extern const char lgstart_cli[], lgend_cli[];
-extern const char lgstart_sti[], lgend_sti[];
-extern const char lgstart_popf[], lgend_popf[];
-extern const char lgstart_pushf[], lgend_pushf[];
-extern const char lgstart_iret[], lgend_iret[];
-extern void lguest_iret(void);
-
-struct lguest_data lguest_data = {
- .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
- .noirq_start = (u32)lguest_noirq_start,
- .noirq_end = (u32)lguest_noirq_end,
- .blocked_interrupts = { 1 }, /* Block timer interrupts */
-};
-struct lguest_device_desc *lguest_devices;
-static cycle_t clock_base;
-
-/*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first
- * real optimization trick!
- *
- * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
- * them as a batch when lazy_mode is eventually turned off. Because hypercalls
- * are reasonably expensive, batching them up makes sense. For example, a
- * large mmap might update dozens of page table entries: that code calls
- * paravirt_enter_lazy_mmu(), does the dozen updates, then calls
- * lguest_leave_lazy_mode().
- *
- * So, when we're in lazy mode, we call async_hcall() to store the call for
- * future processing. When lazy mode is turned off we issue a hypercall to
- * flush the stored calls.
- */
-static void lguest_leave_lazy_mode(void)
-{
- paravirt_leave_lazy(paravirt_get_lazy_mode());
- hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
-}
-
-static void lazy_hcall(unsigned long call,
- unsigned long arg1,
- unsigned long arg2,
- unsigned long arg3)
-{
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- hcall(call, arg1, arg2, arg3);
- else
- async_hcall(call, arg1, arg2, arg3);
-}
-
-/* async_hcall() is pretty simple: I'm quite proud of it really. We have a
- * ring buffer of stored hypercalls which the Host will run through next time we
- * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall
- * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
- * and 255 once the Host has finished with it.
- *
- * If we come around to a slot which hasn't been finished, then the table is
- * full and we just make the hypercall directly. This has the nice side
- * effect of causing the Host to run all the stored calls in the ring buffer
- * which empties it for next time! */
-void async_hcall(unsigned long call,
- unsigned long arg1, unsigned long arg2, unsigned long arg3)
-{
- /* Note: This code assumes we're uniprocessor. */
- static unsigned int next_call;
- unsigned long flags;
-
- /* Disable interrupts if not already disabled: we don't want an
- * interrupt handler making a hypercall while we're already doing
- * one! */
- local_irq_save(flags);
- if (lguest_data.hcall_status[next_call] != 0xFF) {
- /* Table full, so do normal hcall which will flush table. */
- hcall(call, arg1, arg2, arg3);
- } else {
- lguest_data.hcalls[next_call].eax = call;
- lguest_data.hcalls[next_call].edx = arg1;
- lguest_data.hcalls[next_call].ebx = arg2;
- lguest_data.hcalls[next_call].ecx = arg3;
- /* Arguments must all be written before we mark it to go */
- wmb();
- lguest_data.hcall_status[next_call] = 0;
- if (++next_call == LHCALL_RING_SIZE)
- next_call = 0;
- }
- local_irq_restore(flags);
-}
-/*:*/
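(Editor's aside: the guest side publishes in two steps — fill the slot, then flip the status byte, with a write barrier between so the Host never sees a ready status before the arguments. A user-space sketch using GCC builtins; the designated range initializer is the same GCC extension the kernel code above uses:)

#include <stdio.h>

#define RING_SIZE 4

struct slot { unsigned long call, a1, a2, a3; };

static struct slot ring[RING_SIZE];
static volatile unsigned char status[RING_SIZE] = {
	[0 ... RING_SIZE-1] = 0xFF		/* 0xFF = free, 0 = ready */
};
static unsigned int next_call;

static int queue_call(unsigned long call, unsigned long a1)
{
	if (status[next_call] != 0xFF)
		return -1;	/* ring full: caller would flush synchronously */
	ring[next_call] = (struct slot){ call, a1, 0, 0 };
	__sync_synchronize();	/* arguments visible before the status flips */
	status[next_call] = 0;
	if (++next_call == RING_SIZE)
		next_call = 0;
	return 0;
}

int main(void)
{
	return queue_call(7, 42) ? 1 : 0;
}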
-
-/* Wrappers for the SEND_DMA and BIND_DMA hypercalls. This is mainly because
- * Jeff Garzik complained that __pa() should never appear in drivers, and this
- * helps remove most of them. But also, it wraps some ugliness. */
-void lguest_send_dma(unsigned long key, struct lguest_dma *dma)
-{
- /* The hcall might not write this if something goes wrong */
- dma->used_len = 0;
- hcall(LHCALL_SEND_DMA, key, __pa(dma), 0);
-}
-
-int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas,
- unsigned int num, u8 irq)
-{
- /* This is the only hypercall which actually wants 5 arguments, and we
- * only support 4. Fortunately the interrupt number is always less
- * than 256, so we can pack it with the number of dmas in the final
- * argument. */
- if (!hcall(LHCALL_BIND_DMA, key, __pa(dmas), (num << 8) | irq))
- return -ENOMEM;
- return 0;
-}
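(Editor's aside: the pack-two-values-into-one-argument trick is simple enough to show in a few lines; unpacking reverses the shift and mask:)

#include <stdio.h>

int main(void)
{
	unsigned int num = 12, irq = 9;		/* irq is always < 256 */
	unsigned long packed = (num << 8) | irq;

	printf("num=%lu irq=%lu\n", packed >> 8, packed & 0xFF);
	return 0;
}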
-
-/* Unbinding is the same hypercall as binding, but with 0 num & irq. */
-void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas)
-{
- hcall(LHCALL_BIND_DMA, key, __pa(dmas), 0);
-}
-
-/* For guests, device memory can be used as normal memory, so we cast away the
- * __iomem to quieten sparse. */
-void *lguest_map(unsigned long phys_addr, unsigned long pages)
-{
- return (__force void *)ioremap(phys_addr, PAGE_SIZE*pages);
-}
-
-void lguest_unmap(void *addr)
-{
- iounmap((__force void __iomem *)addr);
-}
-
-/*G:033
- * Here are our first native-instruction replacements: four functions for
- * interrupt control.
- *
- * The simplest way of implementing these would be to have "turn interrupts
- * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow:
- * these are by far the most commonly called functions of those we override.
- *
- * So instead we keep an "irq_enabled" field inside our "struct lguest_data",
- * which the Guest can update with a single instruction. The Host knows to
- * check there when it wants to deliver an interrupt.
- */
-
-/* save_flags() is expected to return the processor state (ie. "eflags"). The
- * eflags word contains all kind of stuff, but in practice Linux only cares
- * about the interrupt flag. Our "save_flags()" just returns that. */
-static unsigned long save_fl(void)
-{
- return lguest_data.irq_enabled;
-}
-
-/* "restore_flags" just sets the flags back to the value given. */
-static void restore_fl(unsigned long flags)
-{
- lguest_data.irq_enabled = flags;
-}
-
-/* Interrupts go off... */
-static void irq_disable(void)
-{
- lguest_data.irq_enabled = 0;
-}
-
-/* Interrupts go on... */
-static void irq_enable(void)
-{
- lguest_data.irq_enabled = X86_EFLAGS_IF;
-}
-/*:*/
-/*M:003 Note that we don't check for outstanding interrupts when we re-enable
- * them (or when we unmask an interrupt). This seems to work for the moment,
- * since interrupts are rare and we'll just get the interrupt on the next timer
- * tick, but when we turn on CONFIG_NO_HZ, we should revisit this. One way
- * would be to put the "irq_enabled" field in a page by itself, and have the
- * Host write-protect it when an interrupt comes in when irqs are disabled.
- * There will then be a page fault as soon as interrupts are re-enabled. :*/
-
-/*G:034
- * The Interrupt Descriptor Table (IDT).
- *
- * The IDT tells the processor what to do when an interrupt comes in. Each
- * entry in the table is a 64-bit descriptor: this holds the privilege level,
- * address of the handler, and... well, who cares? The Guest just asks the
- * Host to make the change anyway, because the Host controls the real IDT.
- */
-static void lguest_write_idt_entry(struct desc_struct *dt,
- int entrynum, u32 low, u32 high)
-{
- /* Keep the local copy up to date. */
- write_dt_entry(dt, entrynum, low, high);
- /* Tell Host about this new entry. */
- hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high);
-}
-
-/* Changing to a different IDT is very rare: we keep the IDT up-to-date every
- * time it is written, so we can simply loop through all entries and tell the
- * Host about them. */
-static void lguest_load_idt(const struct Xgt_desc_struct *desc)
-{
- unsigned int i;
- struct desc_struct *idt = (void *)desc->address;
-
- for (i = 0; i < (desc->size+1)/8; i++)
- hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
-}
-
-/*
- * The Global Descriptor Table.
- *
- * The Intel architecture defines another table, called the Global Descriptor
- * Table (GDT). You tell the CPU where it is (and its size) using the "lgdt"
- * instruction, and then several other instructions refer to entries in the
- * table. There are three entries which the Switcher needs, so the Host simply
- * controls the entire thing and the Guest asks it to make changes using the
- * LOAD_GDT hypercall.
- *
- * This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY
- * hypercall and use that repeatedly to load a new IDT. I don't think it
- * really matters, but wouldn't it be nice if they were the same?
- */
-static void lguest_load_gdt(const struct Xgt_desc_struct *desc)
-{
- BUG_ON((desc->size+1)/8 != GDT_ENTRIES);
- hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0);
-}
-
-/* For a single GDT entry which changes, we do the lazy thing: alter our GDT,
- * then tell the Host to reload the entire thing. This operation is so rare
- * that this naive implementation is reasonable. */
-static void lguest_write_gdt_entry(struct desc_struct *dt,
- int entrynum, u32 low, u32 high)
-{
- write_dt_entry(dt, entrynum, low, high);
- hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);
-}
-
-/* OK, I lied. There are three "thread local storage" GDT entries which change
- * on every context switch (these three entries are how glibc implements
- * __thread variables). So we have a hypercall specifically for this case. */
-static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
-{
- /* There's one problem which normal hardware doesn't have: the Host
- * can't handle us removing entries we're currently using. So we clear
- * the GS register here: if it's needed it'll be reloaded anyway. */
- loadsegment(gs, 0);
- lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
-}
-
-/*G:038 That's enough excitement for now, back to ploughing through each of
- * the different pv_ops structures (we're about 1/3 of the way through).
- *
- * This is the Local Descriptor Table, another weird Intel thingy. Linux only
- * uses this for some strange applications like Wine. We don't do anything
- * here, so they'll get an informative and friendly Segmentation Fault. */
-static void lguest_set_ldt(const void *addr, unsigned entries)
-{
-}
-
-/* This loads a GDT entry into the "Task Register": that entry points to a
- * structure called the Task State Segment. Some comments scattered through
- * the kernel code indicate that this was used for task switching in ages
- * past, along with blood sacrifice and astrology.
- *
- * Now there's nothing interesting in here that we don't get told elsewhere.
- * But the native version uses the "ltr" instruction, which makes the Host
- * complain to the Guest about a Segmentation Fault and it'll oops. So we
- * override the native version with a do-nothing version. */
-static void lguest_load_tr_desc(void)
-{
-}
-
-/* The "cpuid" instruction is a way of querying both the CPU identity
- * (manufacturer, model, etc) and its features. It was introduced before the
- * Pentium in 1993 and keeps getting extended by both Intel and AMD. As you
- * might imagine, after a decade and a half of this treatment, it is now a
- * ball of hair. Its entry in the current Intel manual runs to 28 pages.
- *
- * This instruction even has its own Wikipedia entry. The Wikipedia entry
- * has been translated into 4 languages. I am not making this up!
- *
- * We could get funky here and identify ourselves as "GenuineLguest", but
- * instead we just use the real "cpuid" instruction. Then I pretty much turned
- * off feature bits until the Guest booted. (Don't say that: you'll damage
- * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is
- * hardly future-proof.) No one's listening! They don't like you anyway,
- * parenthetic weirdo!
- *
- * Replacing the cpuid so we can turn features off is great for the kernel, but
- * anyone (including userspace) can just use the raw "cpuid" instruction and
- * the Host won't even notice since it isn't privileged. So we try not to get
- * too worked up about it. */
-static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- int function = *eax;
-
- native_cpuid(eax, ebx, ecx, edx);
- switch (function) {
- case 1: /* Basic feature request. */
- /* We only allow the kernel to see SSE3, CMPXCHG16B and SSSE3 */
- *ecx &= 0x00002201;
- /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
- *edx &= 0x07808101;
- /* The Host can do a nice optimization if it knows that the
- * kernel mappings (addresses above 0xC0000000 or whatever
- * PAGE_OFFSET is set to) haven't changed. But Linux calls
- * flush_tlb_user() for both user and kernel mappings unless
- * the Page Global Enable (PGE) feature bit is set. */
- *edx |= 0x00002000;
- break;
- case 0x80000000:
- /* Futureproof this a little: if they ask how much extended
- * processor information there is, limit it to known fields. */
- if (*eax > 0x80000008)
- *eax = 0x80000008;
- break;
- }
-}
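For reference, those magic masks decode as the standard CPUID leaf 1 bit
positions: 0x00002201 keeps ECX bits 0 (SSE3), 9 (SSSE3) and 13 (CMPXCHG16B);
0x07808101 keeps EDX bits 0 (FPU), 8 (CMPXCHG8B), 15 (CMOV), 23 (MMX),
24 (FXSR), 25 (SSE) and 26 (SSE2); and the 0x00002000 we OR back in is EDX
bit 13, Page Global Enable.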
-
-/* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
- * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother
- * it. The Host needs to know when the Guest wants to change them, so we have
- * a whole series of functions like read_cr0() and write_cr0().
- *
- * We start with CR0. CR0 allows you to turn on and off all kinds of basic
- * features, but Linux only really cares about one: the horrifically-named Task
- * Switched (TS) bit at bit 3 (ie. 8)
- *
- * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if
- * the floating point unit is used. Which allows us to restore FPU state
- * lazily after a task switch, and Linux uses that gratefully, but wouldn't a
- * name like "FPUTRAP bit" be a little less cryptic?
- *
- * We store cr0 (and cr3) locally, because the Host never changes it. The
- * Guest sometimes wants to read it and we'd prefer not to bother the Host
- * unnecessarily. */
-static unsigned long current_cr0, current_cr3;
-static void lguest_write_cr0(unsigned long val)
-{
- /* 8 == TS bit. */
- lazy_hcall(LHCALL_TS, val & 8, 0, 0);
- current_cr0 = val;
-}
-
-static unsigned long lguest_read_cr0(void)
-{
- return current_cr0;
-}
-
-/* Intel provided a special instruction to clear the TS bit for people too cool
- * to use write_cr0() to do it. This "clts" instruction is faster, because all
- * the vowels have been optimized out. */
-static void lguest_clts(void)
-{
- lazy_hcall(LHCALL_TS, 0, 0, 0);
- current_cr0 &= ~8U;
-}
-
-/* CR2 is the virtual address of the last page fault, which the Guest only ever
- * reads. The Host kindly writes this into our "struct lguest_data", so we
- * just read it out of there. */
-static unsigned long lguest_read_cr2(void)
-{
- return lguest_data.cr2;
-}
-
-/* CR3 is the current toplevel pagetable page: the principle is the same as
- * cr0. Keep a local copy, and tell the Host when it changes. */
-static void lguest_write_cr3(unsigned long cr3)
-{
- lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0);
- current_cr3 = cr3;
-}
-
-static unsigned long lguest_read_cr3(void)
-{
- return current_cr3;
-}
-
-/* CR4 is used to enable and disable PGE, but we don't care. */
-static unsigned long lguest_read_cr4(void)
-{
- return 0;
-}
-
-static void lguest_write_cr4(unsigned long val)
-{
-}
-
-/*
- * Page Table Handling.
- *
- * Now would be a good time to take a rest and grab a coffee or similarly
- * relaxing stimulant. The easy parts are behind us, and the trek gradually
- * winds uphill from here.
- *
- * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU
- * maps virtual addresses to physical addresses using "page tables". We could
- * use one huge index of a million entries (2^20 4k pages cover the 4GB
- * address space): at 4 bytes per entry that's 4MB, or 1024 pages, just to
- * hold the page tables. But since most virtual addresses
- * are unused, we use a two level index which saves space. The CR3 register
- * contains the physical address of the top level "page directory" page, which
- * contains physical addresses of up to 1024 second-level pages. Each of these
- * second level pages contains up to 1024 physical addresses of actual pages,
- * or Page Table Entries (PTEs).
- *
- * Here's a diagram, where arrows indicate physical addresses:
- *
- * CR3 ---> +---------+
- *          |  --------->+---------+
- *          |         |  | PADDR1  |
- *        Top-level   |  | PADDR2  |
- *        (PMD) page  |  |         |
- *          |         |  Lower-level
- *          |         |  (PTE) page
- *          |         |  |         |
- *            ....            ....
- *
- * So to convert a virtual address to a physical address, we look up the top
- * level, which points us to the second level, which gives us the physical
- * address of that page. If the top level entry was not present, or the second
- * level entry was not present, then the virtual address is invalid (we
- * say "the page was not mapped").
- *
- * Put another way, a 32-bit virtual address is divided up like so:
- *
- *  1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
- * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>|
- *    Index into top     Index into second      Offset within page
- *  page directory page    pagetable page
- *
- * The kernel spends a lot of time changing both the top-level page directory
- * and lower-level pagetable pages. The Guest doesn't know physical addresses,
- * so while it maintains these page tables exactly like normal, it also needs
- * to keep the Host informed whenever it makes a change: the Host will create
- * the real page tables based on the Guest's.
- */
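To make that 10/10/12 split concrete, here is a minimal sketch (plain C,
nothing lguest-specific, the example address is arbitrary) of how a virtual
address breaks apart:

    #include <stdio.h>

    /* Decompose a 32-bit virtual address under the standard two-level
     * (non-PAE) i386 scheme: 10 bits of page-directory index, 10 bits of
     * pagetable index, 12 bits of offset within the 4096-byte page. */
    int main(void)
    {
            unsigned long vaddr = 0xC0100234UL;             /* arbitrary example */
            unsigned int pgd_index = (vaddr >> 22) & 0x3FF; /* 768 */
            unsigned int pte_index = (vaddr >> 12) & 0x3FF; /* 256 */
            unsigned int offset    = vaddr & 0xFFF;         /* 0x234 */

            printf("pgd=%u pte=%u offset=0x%x\n",
                   pgd_index, pte_index, offset);
            return 0;
    }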
-
-/* The Guest calls this to set a second-level entry (pte), ie. to map a page
- * into a process' address space. We set the entry then tell the Host the
- * toplevel and address this corresponds to. The Guest uses one pagetable per
- * process, so we need to tell the Host which one we're changing (mm->pgd). */
-static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval)
-{
- *ptep = pteval;
- lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low);
-}
-
-/* The Guest calls this to set a top-level entry. Again, we set the entry then
- * tell the Host which top-level page we changed, and the index of the entry we
- * changed. */
-static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
- *pmdp = pmdval;
- lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK,
- (__pa(pmdp)&(PAGE_SIZE-1))/4, 0);
-}
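Those hypercall arguments deserve a second look: __pa(pmdp) & PAGE_MASK is the
physical address of the page-directory page itself, and the byte offset within
that page divided by 4 (each top-level entry being 4 bytes) is the index of
the entry we just changed.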
-
-/* There are a couple of legacy places where the kernel sets a PTE, but we
- * don't know the top level any more. This is useless for us, since we don't
- * know which pagetable is changing or what address, so we just tell the Host
- * to forget all of them. Fortunately, this is very rare.
- *
- * ... except in early boot when the kernel sets up the initial pagetables,
- * which makes booting astonishingly slow. So we don't even tell the Host
- * anything changed until we've done the first page table switch.
- */
-static void lguest_set_pte(pte_t *ptep, pte_t pteval)
-{
- *ptep = pteval;
- /* Don't bother with hypercall before initial setup. */
- if (current_cr3)
- lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
-}
-
-/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
- * native page table operations. On native hardware you can set a new page
- * table entry whenever you want, but if you want to remove one you have to do
- * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
- *
- * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only
- * called when a valid entry is written, not when it's removed (ie. marked not
- * present). Instead, this is where we come when the Guest wants to remove a
- * page table entry: we tell the Host to set that entry to 0 (ie. the present
- * bit is zero). */
-static void lguest_flush_tlb_single(unsigned long addr)
-{
- /* Simply set it to zero: if it was not, it will fault back in. */
- lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0);
-}
-
-/* This is what happens after the Guest has removed a large number of entries.
- * This tells the Host that any of the page table entries for userspace might
- * have changed, ie. virtual addresses below PAGE_OFFSET. */
-static void lguest_flush_tlb_user(void)
-{
- lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0);
-}
-
-/* This is called when the kernel page tables have changed. That's not very
- * common (unless the Guest is using highmem, which makes the Guest extremely
- * slow), so it's worth separating this from the user flushing above. */
-static void lguest_flush_tlb_kernel(void)
-{
- lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
-}
-
-/*
- * The Unadvanced Programmable Interrupt Controller.
- *
- * This is an attempt to implement the simplest possible interrupt controller.
- * I spent some time looking through routines like set_irq_chip_and_handler,
- * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and
- * I *think* this is as simple as it gets.
- *
- * We can tell the Host what interrupts we want blocked ready for using the
- * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as
- * simple as setting a bit. We don't actually "ack" interrupts as such, we
- * just mask and unmask them. I wonder if we should be cleverer?
- */
-static void disable_lguest_irq(unsigned int irq)
-{
- set_bit(irq, lguest_data.blocked_interrupts);
-}
-
-static void enable_lguest_irq(unsigned int irq)
-{
- clear_bit(irq, lguest_data.blocked_interrupts);
-}
-
-/* This structure describes the lguest IRQ controller. */
-static struct irq_chip lguest_irq_controller = {
- .name = "lguest",
- .mask = disable_lguest_irq,
- .mask_ack = disable_lguest_irq,
- .unmask = enable_lguest_irq,
-};
-
-/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
- * interrupt (except 128, which is used for system calls), and then tells the
- * Linux infrastructure that each interrupt is controlled by our level-based
- * lguest interrupt controller. */
-static void __init lguest_init_IRQ(void)
-{
- unsigned int i;
-
- for (i = 0; i < LGUEST_IRQS; i++) {
- int vector = FIRST_EXTERNAL_VECTOR + i;
- if (vector != SYSCALL_VECTOR) {
- set_intr_gate(vector, interrupt[i]);
- set_irq_chip_and_handler(i, &lguest_irq_controller,
- handle_level_irq);
- }
- }
- /* This call is required to set up for 4k stacks, where we have
- * separate stacks for hard and soft interrupts. */
- irq_ctx_init(smp_processor_id());
-}
-
-/*
- * Time.
- *
- * It would be far better for everyone if the Guest had its own clock, but
- * until then the Host gives us the time on every interrupt.
- */
-static unsigned long lguest_get_wallclock(void)
-{
- return lguest_data.time.tv_sec;
-}
-
-static cycle_t lguest_clock_read(void)
-{
- unsigned long sec, nsec;
-
- /* If the Host tells us the TSC speed, we can trust it. */
- if (lguest_data.tsc_khz)
- return native_read_tsc();
-
- /* If we can't use the TSC, we read the time value written by the Host.
- * Since it's in two parts (seconds and nanoseconds), we risk reading
- * it just as it's changing from 99 & 0.999999999 to 100 and 0, and
- * getting 99 and 0. As Linux tends to come apart under the stress of
- * time travel, we must be careful: */
- do {
- /* First we read the seconds part. */
- sec = lguest_data.time.tv_sec;
- /* This read memory barrier tells the compiler and the CPU that
- * this can't be reordered: we have to complete the above
- * before going on. */
- rmb();
- /* Now we read the nanoseconds part. */
- nsec = lguest_data.time.tv_nsec;
- /* Make sure we've done that. */
- rmb();
- /* Now if the seconds part has changed, try again. */
- } while (unlikely(lguest_data.time.tv_sec != sec));
-
- /* Our non-TSC clock is in real nanoseconds. */
- return sec*1000000000ULL + nsec;
-}
-
-/* This is what we tell the kernel is our clocksource. */
-static struct clocksource lguest_clock = {
- .name = "lguest",
- .rating = 400,
- .read = lguest_clock_read,
- .mask = CLOCKSOURCE_MASK(64),
- .mult = 1 << 22,
- .shift = 22,
-};
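The mult/shift pair is how the clocksource core turns cycles into nanoseconds:
ns = (cycles * mult) >> shift. With mult = 1 << 22 and shift = 22 that works
out to ns = cycles, an identity conversion, which is exactly right for the
fallback clock since lguest_clock_read() already returns nanoseconds;
lguest_time_init() below recomputes mult when we are really counting TSC
cycles.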
-
-/* The "scheduler clock" is just our real clock, adjusted to start at zero */
-static unsigned long long lguest_sched_clock(void)
-{
- return cyc2ns(&lguest_clock, lguest_clock_read() - clock_base);
-}
-
-/* We also need a "struct clock_event_device": Linux asks us to set it to go
- * off some time in the future. Actually, James Morris figured all this out, I
- * just applied the patch. */
-static int lguest_clockevent_set_next_event(unsigned long delta,
- struct clock_event_device *evt)
-{
- if (delta < LG_CLOCK_MIN_DELTA) {
- if (printk_ratelimit())
- printk(KERN_DEBUG "%s: small delta %lu ns\n",
- __FUNCTION__, delta);
- return -ETIME;
- }
- hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0);
- return 0;
-}
-
-static void lguest_clockevent_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- switch (mode) {
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- /* A 0 argument shuts the clock down. */
- hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0);
- break;
- case CLOCK_EVT_MODE_ONESHOT:
- /* This is what we expect. */
- break;
- case CLOCK_EVT_MODE_PERIODIC:
- BUG();
- case CLOCK_EVT_MODE_RESUME:
- break;
- }
-}
-
-/* This describes our primitive timer chip. */
-static struct clock_event_device lguest_clockevent = {
- .name = "lguest",
- .features = CLOCK_EVT_FEAT_ONESHOT,
- .set_next_event = lguest_clockevent_set_next_event,
- .set_mode = lguest_clockevent_set_mode,
- .rating = INT_MAX,
- .mult = 1,
- .shift = 0,
- .min_delta_ns = LG_CLOCK_MIN_DELTA,
- .max_delta_ns = LG_CLOCK_MAX_DELTA,
-};
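The same mult/shift trick runs in the other direction for clock events: the
core converts a nanosecond delta into device ticks as (ns * mult) >> shift, so
mult = 1 and shift = 0 mean the delta handed to
lguest_clockevent_set_next_event() is already in nanoseconds, which is what
the LHCALL_SET_CLOCKEVENT hypercall wants.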
-
-/* This is the Guest timer interrupt handler (hardware interrupt 0). We just
- * call the clockevent infrastructure and it does whatever needs doing. */
-static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
-{
- unsigned long flags;
-
- /* Don't interrupt us while this is running. */
- local_irq_save(flags);
- lguest_clockevent.event_handler(&lguest_clockevent);
- local_irq_restore(flags);
-}
-
-/* At some point in the boot process, we get asked to set up our timing
- * infrastructure. The kernel doesn't expect timer interrupts before this, but
- * we cleverly initialized the "blocked_interrupts" field of "struct
- * lguest_data" so that timer interrupts were blocked until now. */
-static void lguest_time_init(void)
-{
- /* Set up the timer interrupt (0) to go to our simple timer routine */
- set_irq_handler(0, lguest_time_irq);
-
- /* Our clock structure looks like arch/i386/kernel/tsc.c's if we can use
- * the TSC, otherwise it's a dumb nanosecond-resolution clock. Either
- * way, the "rating" is initialized so high that it's always chosen
- * over any other clocksource. */
- if (lguest_data.tsc_khz) {
- lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz,
- lguest_clock.shift);
- lguest_clock.flags = CLOCK_SOURCE_IS_CONTINUOUS;
- }
- clock_base = lguest_clock_read();
- clocksource_register(&lguest_clock);
-
- /* Now we've set up our clock, we can use it as the scheduler clock */
- pv_time_ops.sched_clock = lguest_sched_clock;
-
- /* We can't set cpumask in the initializer: damn C limitations! Set it
- * here and register our timer device. */
- lguest_clockevent.cpumask = cpumask_of_cpu(0);
- clockevents_register_device(&lguest_clockevent);
-
- /* Finally, we unblock the timer interrupt. */
- enable_lguest_irq(0);
-}
-
-/*
- * Miscellaneous bits and pieces.
- *
- * Here is an oddball collection of functions which the Guest needs for things
- * to work. They're pretty simple.
- */
-
-/* The Guest needs to tell the host what stack it expects traps to use. For
- * native hardware, this is part of the Task State Segment mentioned above in
- * lguest_load_tr_desc(), but to help hypervisors there's this special call.
- *
- * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data
- * segment), the privilege level (we're privilege level 1, the Host is 0 and
- * will not tolerate us trying to use that), the stack pointer, and the number
- * of pages in the stack. */
-static void lguest_load_esp0(struct tss_struct *tss,
- struct thread_struct *thread)
-{
- lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->esp0,
- THREAD_SIZE/PAGE_SIZE);
-}
-
-/* Let's just say, I wouldn't do debugging under a Guest. */
-static void lguest_set_debugreg(int regno, unsigned long value)
-{
- /* FIXME: Implement */
-}
-
-/* There are times when the kernel wants to make sure that no memory writes are
- * caught in the cache (that they've all reached real hardware devices). This
- * doesn't matter for the Guest which has virtual hardware.
- *
- * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush
- * (clflush) instruction is available and the kernel uses that. Otherwise, it
- * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction.
- * Unlike clflush, wbinvd can only be run at privilege level 0. So we can
- * ignore clflush, but replace wbinvd.
- */
-static void lguest_wbinvd(void)
-{
-}
-
-/* If the Guest expects to have an Advanced Programmable Interrupt Controller,
- * we play dumb by ignoring writes and returning 0 for reads. So it's no
- * longer Programmable nor Controlling anything, and I don't think 8 lines of
- * code qualifies for Advanced. It will also never interrupt anything. It
- * does, however, allow us to get through the Linux boot code. */
-#ifdef CONFIG_X86_LOCAL_APIC
-static void lguest_apic_write(unsigned long reg, unsigned long v)
-{
-}
-
-static unsigned long lguest_apic_read(unsigned long reg)
-{
- return 0;
-}
-#endif
-
-/* STOP! Until an interrupt comes in. */
-static void lguest_safe_halt(void)
-{
- hcall(LHCALL_HALT, 0, 0, 0);
-}
-
-/* Perhaps CRASH isn't the best name for this hypercall, but we use it to get a
- * message out when we're crashing as well as elegant termination like powering
- * off.
- *
- * Note that the Host always prefers that the Guest speak in physical addresses
- * rather than virtual addresses, so we use __pa() here. */
-static void lguest_power_off(void)
-{
- hcall(LHCALL_CRASH, __pa("Power down"), 0, 0);
-}
-
-/*
- * Panicking.
- *
- * Don't. But if you did, this is what happens.
- */
-static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
-{
- hcall(LHCALL_CRASH, __pa(p), 0, 0);
- /* The hcall won't return, but to keep gcc happy, we're "done". */
- return NOTIFY_DONE;
-}
-
-static struct notifier_block paniced = {
- .notifier_call = lguest_panic
-};
-
-/* Setting up memory is fairly easy. */
-static __init char *lguest_memory_setup(void)
-{
- /* We do this here and not earlier because lockcheck barfs if we do it
- * before start_kernel() */
- atomic_notifier_chain_register(&panic_notifier_list, &paniced);
-
- /* The Linux bootloader header contains an "e820" memory map: the
- * Launcher populated the first entry with our memory limit. */
- add_memory_region(boot_params.e820_map[0].addr,
- boot_params.e820_map[0].size,
- boot_params.e820_map[0].type);
-
- /* This string is for the boot messages. */
- return "LGUEST";
-}
-
-/*G:050
- * Patching (Powerfully Placating Performance Pedants)
- *
- * We have already seen that pv_ops structures let us replace simple
- * native instructions with calls to the appropriate back end all throughout
- * the kernel. This allows the same kernel to run as a Guest and as a native
- * kernel, but it's slow because of all the indirect branches.
- *
- * Remember that David Wheeler quote about "Any problem in computer science can
- * be solved with another layer of indirection"? The rest of that quote is
- * "... But that usually will create another problem." This is the first of
- * those problems.
- *
- * Our current solution is to allow the paravirt back end to optionally patch
- * over the indirect calls to replace them with something more efficient. We
- * patch the four most commonly called functions: disable interrupts, enable
- * interrupts, restore interrupts and save interrupts. We usually have 10
- * bytes to patch into: the Guest versions of these operations are small enough
- * that we can fit comfortably.
- *
- * First we need assembly templates of each of the patchable Guest operations,
- * and these are in lguest_asm.S. */
-
-/*G:060 We construct a table from the assembler templates: */
-static const struct lguest_insns
-{
- const char *start, *end;
-} lguest_insns[] = {
- [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
- [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti },
- [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf },
- [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
-};
-
-/* Now our patch routine is fairly simple (based on the native one in
- * paravirt.c). If we have a replacement, we copy it in and return how much of
- * the available space we used. */
-static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
- unsigned long addr, unsigned len)
-{
- unsigned int insn_len;
-
- /* Don't do anything special if we don't have a replacement */
- if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
- return paravirt_patch_default(type, clobber, ibuf, addr, len);
-
- insn_len = lguest_insns[type].end - lguest_insns[type].start;
-
- /* Similarly if we can't fit the replacement (shouldn't happen, but
- * let's be thorough). */
- if (len < insn_len)
- return paravirt_patch_default(type, clobber, ibuf, addr, len);
-
- /* Copy in our instructions. */
- memcpy(ibuf, lguest_insns[type].start, insn_len);
- return insn_len;
-}
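What actually gets copied in? The templates come from lguest_asm.S (they
appear later in this patch): the irq_disable replacement, for instance, is the
single instruction "movl $0, lguest_data+LGUEST_DATA_irq_enabled", so a
patched call site shrinks from an indirect function call to one direct memory
write.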
-
-/*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops
- * structures in the kernel provide points for (almost) every routine we have
- * to override to avoid privileged instructions. */
-__init void lguest_init(void *boot)
-{
- /* Copy boot parameters first: the Launcher put the physical location
- * in %esi, and head.S converted that to a virtual address and handed
- * it to us. We use "__memcpy" because "memcpy" sometimes tries to do
- * tricky things to go faster, and we're not ready for that. */
- __memcpy(&boot_params, boot, PARAM_SIZE);
- /* The boot parameters also tell us where the command-line is: save
- * that, too. */
- __memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr),
- COMMAND_LINE_SIZE);
-
- /* We're under lguest, paravirt is enabled, and we're running at
- * privilege level 1, not 0 as normal. */
- pv_info.name = "lguest";
- pv_info.paravirt_enabled = 1;
- pv_info.kernel_rpl = 1;
-
- /* We set up all the lguest overrides for sensitive operations. These
- * are detailed with the operations themselves. */
-
- /* interrupt-related operations */
- pv_irq_ops.init_IRQ = lguest_init_IRQ;
- pv_irq_ops.save_fl = save_fl;
- pv_irq_ops.restore_fl = restore_fl;
- pv_irq_ops.irq_disable = irq_disable;
- pv_irq_ops.irq_enable = irq_enable;
- pv_irq_ops.safe_halt = lguest_safe_halt;
-
- /* init-time operations */
- pv_init_ops.memory_setup = lguest_memory_setup;
- pv_init_ops.patch = lguest_patch;
-
- /* Intercepts of various cpu instructions */
- pv_cpu_ops.load_gdt = lguest_load_gdt;
- pv_cpu_ops.cpuid = lguest_cpuid;
- pv_cpu_ops.load_idt = lguest_load_idt;
- pv_cpu_ops.iret = lguest_iret;
- pv_cpu_ops.load_esp0 = lguest_load_esp0;
- pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
- pv_cpu_ops.set_ldt = lguest_set_ldt;
- pv_cpu_ops.load_tls = lguest_load_tls;
- pv_cpu_ops.set_debugreg = lguest_set_debugreg;
- pv_cpu_ops.clts = lguest_clts;
- pv_cpu_ops.read_cr0 = lguest_read_cr0;
- pv_cpu_ops.write_cr0 = lguest_write_cr0;
- pv_cpu_ops.read_cr4 = lguest_read_cr4;
- pv_cpu_ops.write_cr4 = lguest_write_cr4;
- pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
- pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
- pv_cpu_ops.wbinvd = lguest_wbinvd;
- pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu;
- pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
-
- /* pagetable management */
- pv_mmu_ops.write_cr3 = lguest_write_cr3;
- pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
- pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
- pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
- pv_mmu_ops.set_pte = lguest_set_pte;
- pv_mmu_ops.set_pte_at = lguest_set_pte_at;
- pv_mmu_ops.set_pmd = lguest_set_pmd;
- pv_mmu_ops.read_cr2 = lguest_read_cr2;
- pv_mmu_ops.read_cr3 = lguest_read_cr3;
- pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
- pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
-
-#ifdef CONFIG_X86_LOCAL_APIC
- /* apic read/write intercepts */
- pv_apic_ops.apic_write = lguest_apic_write;
- pv_apic_ops.apic_write_atomic = lguest_apic_write;
- pv_apic_ops.apic_read = lguest_apic_read;
-#endif
-
- /* time operations */
- pv_time_ops.get_wallclock = lguest_get_wallclock;
- pv_time_ops.time_init = lguest_time_init;
-
- /* Now is a good time to look at the implementations of these functions
- * before returning to the rest of lguest_init(). */
-
- /*G:070 Now we've seen all the paravirt_ops, we return to
- * lguest_init() where the rest of the fairly chaotic boot setup
- * occurs.
- *
- * The Host expects our first hypercall to tell it where our "struct
- * lguest_data" is, so we do that first. */
- hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
-
- /* The native boot code sets up initial page tables immediately after
- * the kernel itself, and sets init_pg_tables_end so they're not
- * clobbered. The Launcher places our initial pagetables somewhere at
- * the top of our physical memory, so we don't need extra space: set
- * init_pg_tables_end to the end of the kernel. */
- init_pg_tables_end = __pa(pg0);
-
- /* Load the %fs segment register (the per-cpu segment register) with
- * the normal data segment to get through booting. */
- asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
-
- /* Clear the part of the kernel data which is expected to be zero.
- * Normally it will be anyway, but if we're loading from a bzImage with
- * CONFIG_RELOCATABLE=y, the relocations will be sitting here. */
- memset(__bss_start, 0, __bss_stop - __bss_start);
-
- /* The Host uses the top of the Guest's virtual address space for the
- * Host<->Guest Switcher, and it tells us how much it needs in
- * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */
- reserve_top_address(lguest_data.reserve_mem);
-
- /* If we don't initialize the lock dependency checker now, it crashes
- * paravirt_disable_iospace. */
- lockdep_init();
-
- /* The IDE code spends about 3 seconds probing for disks: if we reserve
- * all the I/O ports up front it can't get them and so doesn't probe.
- * Other device drivers are similar (but less severe). This cuts the
- * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */
- paravirt_disable_iospace();
-
- /* This is messy CPU setup stuff which the native boot code does before
- * start_kernel, so we have to do it, too: */
- cpu_detect(&new_cpu_data);
- /* head.S usually sets up the first capability word, so do it here. */
- new_cpu_data.x86_capability[0] = cpuid_edx(1);
-
- /* Math is always hard! */
- new_cpu_data.hard_math = 1;
-
-#ifdef CONFIG_X86_MCE
- mce_disabled = 1;
-#endif
-#ifdef CONFIG_ACPI
- acpi_disabled = 1;
- acpi_ht = 0;
-#endif
-
- /* We set the preferred console to "hvc". This is the "hypervisor
- * virtual console" driver written by the PowerPC people, which we also
- * adapted for lguest's use. */
- add_preferred_console("hvc", 0, NULL);
-
- /* Last of all, we set the power management poweroff hook to point to
- * the Guest routine to power off. */
- pm_power_off = lguest_power_off;
-
- /* Now we're set up, call start_kernel() in init/main.c and we proceed
- * to boot as normal. It never returns. */
- start_kernel();
-}
-/*
- * This marks the end of stage II of our journey, The Guest.
- *
- * It is now time for us to explore the nooks and crannies of the three Guest
- * devices and complete our understanding of the Guest in "make Drivers".
- */
diff --git a/drivers/lguest/lguest_asm.S b/drivers/lguest/lguest_asm.S
deleted file mode 100644
index 1ddcd5cd20f..00000000000
--- a/drivers/lguest/lguest_asm.S
+++ /dev/null
@@ -1,93 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/lguest.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/processor-flags.h>
-
-/*G:020 This is where we begin: we have a magic signature which the launcher
- * looks for. The plan is that the Linux boot protocol will be extended with a
- * "platform type" field which will guide us here from the normal entry point,
- * but for the moment this suffices. The normal boot code uses %esi for the
- * boot header, so we do too. We convert it to a virtual address by adding
- * PAGE_OFFSET, and hand it to lguest_init() as its argument (ie. %eax).
- *
- * The .section line puts this code in .init.text so it will be discarded after
- * boot. */
-.section .init.text, "ax", @progbits
-.ascii "GenuineLguest"
- /* Set up initial stack. */
- movl $(init_thread_union+THREAD_SIZE),%esp
- movl %esi, %eax
- addl $__PAGE_OFFSET, %eax
- jmp lguest_init
-
-/*G:055 We create a macro which puts the assembler code between lgstart_ and
- * lgend_ markers. These templates are put in the .text section: they can't be
- * discarded after boot as we may need to patch modules, too. */
-.text
-#define LGUEST_PATCH(name, insns...) \
- lgstart_##name: insns; lgend_##name:; \
- .globl lgstart_##name; .globl lgend_##name
-
-LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
-/*:*/
-
-/* These demarcate the EIP range where the Host should never deliver
- * interrupts. */
-.global lguest_noirq_start
-.global lguest_noirq_end
-
-/*M:004 When the Host reflects a trap or injects an interrupt into the Guest,
- * it sets the eflags interrupt bit on the stack based on
- * lguest_data.irq_enabled, so the Guest iret logic does the right thing when
- * restoring it. However, when the Host sets the Guest up for direct traps,
- * such as system calls, the processor is the one to push eflags onto the
- * stack, and the interrupt bit will be 1 (in reality, interrupts are always
- * enabled in the Guest).
- *
- * This turns out to be harmless: the only trap which should happen under Linux
- * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
- * regions), which has to be reflected through the Host anyway. If another
- * trap *does* go off when interrupts are disabled, the Guest will panic, and
- * we'll never get to this iret! :*/
-
-/*G:045 There is one final paravirt_op that the Guest implements, and glancing
- * at it you can see why I left it to last. It's *cool*! It's in *assembler*!
- *
- * The "iret" instruction is used to return from an interrupt or trap. The
- * stack looks like this:
- * old address
- * old code segment & privilege level
- * old processor flags ("eflags")
- *
- * The "iret" instruction pops those values off the stack and restores them all
- * at once. The only problem is that eflags includes the Interrupt Flag which
- * the Guest can't change: the CPU will simply ignore it when we do an "iret".
- * So we have to copy eflags from the stack to lguest_data.irq_enabled before
- * we do the "iret".
- *
- * There are two problems with this: firstly, we need to use a register to do
- * the copy and secondly, the whole thing needs to be atomic. The first
- * problem is easy to solve: push %eax on the stack so we can use it, and then
- * restore it at the end just before the real "iret".
- *
- * The second is harder: copying eflags to lguest_data.irq_enabled will turn
- * interrupts on before we're finished, so we could be interrupted before we
- * return to userspace or wherever. Our solution to this is to surround the
- * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the
- * Host that it is *never* to interrupt us there, even if interrupts seem to be
- * enabled. */
-ENTRY(lguest_iret)
- pushl %eax
- movl 12(%esp), %eax
-lguest_noirq_start:
- /* Note the %ss: segment prefix here. Normal data accesses use the
- * "ds" segment, but that will have already been restored for whatever
- * we're returning to (such as userspace): we can't trust it. The %ss:
- * prefix makes sure we use the stack segment, which is still valid. */
- movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled
- popl %eax
- iret
-lguest_noirq_end:
diff --git a/drivers/lguest/lguest_bus.c b/drivers/lguest/lguest_bus.c
deleted file mode 100644
index 57329788f8a..00000000000
--- a/drivers/lguest/lguest_bus.c
+++ /dev/null
@@ -1,218 +0,0 @@
-/*P:050 Lguest guests use a very simple bus for devices. It's a simple array
- * of device descriptors contained just above the top of normal memory. The
- * lguest bus is 80% tedious boilerplate code. :*/
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/lguest_bus.h>
-#include <asm/io.h>
-#include <asm/paravirt.h>
-
-static ssize_t type_show(struct device *_dev,
- struct device_attribute *attr, char *buf)
-{
- struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
- return sprintf(buf, "%hu", lguest_devices[dev->index].type);
-}
-static ssize_t features_show(struct device *_dev,
- struct device_attribute *attr, char *buf)
-{
- struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
- return sprintf(buf, "%hx", lguest_devices[dev->index].features);
-}
-static ssize_t pfn_show(struct device *_dev,
- struct device_attribute *attr, char *buf)
-{
- struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
- return sprintf(buf, "%u", lguest_devices[dev->index].pfn);
-}
-static ssize_t status_show(struct device *_dev,
- struct device_attribute *attr, char *buf)
-{
- struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
- return sprintf(buf, "%hx", lguest_devices[dev->index].status);
-}
-static ssize_t status_store(struct device *_dev, struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
- if (sscanf(buf, "%hi", &lguest_devices[dev->index].status) != 1)
- return -EINVAL;
- return count;
-}
-static struct device_attribute lguest_dev_attrs[] = {
- __ATTR_RO(type),
- __ATTR_RO(features),
- __ATTR_RO(pfn),
- __ATTR(status, 0644, status_show, status_store),
- __ATTR_NULL
-};
-
-/*D:130 The generic bus infrastructure requires a function which says whether a
- * device matches a driver. For us, it is simple: "struct lguest_driver"
- * contains a "device_type" field which indicates what type of device it can
- * handle, so we just cast the args and compare: */
-static int lguest_dev_match(struct device *_dev, struct device_driver *_drv)
-{
- struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
- struct lguest_driver *drv = container_of(_drv,struct lguest_driver,drv);
-
- return (drv->device_type == lguest_devices[dev->index].type);
-}
-/*:*/
-
-struct lguest_bus {
- struct bus_type bus;
- struct device dev;
-};
-
-static struct lguest_bus lguest_bus = {
- .bus = {
- .name = "lguest",
- .match = lguest_dev_match,
- .dev_attrs = lguest_dev_attrs,
- },
- .dev = {
- .parent = NULL,
- .bus_id = "lguest",
- }
-};
-
-/*D:140 This is the callback which occurs once the bus infrastructure matches
- * up a device and driver, ie. in response to add_lguest_device() calling
- * device_register(), or register_lguest_driver() calling driver_register().
- *
- * At the moment it's always the latter: the devices are added first, since
- * scan_devices() is called from a "core_initcall", and the drivers themselves
- * called later as a normal "initcall". But it would work the other way too.
- *
- * So now we have the happy couple, we add the status bit to indicate that we
- * found a driver. If the driver truly loves the device, it will return
- * happiness from its probe function (ok, perhaps this wasn't my greatest
- * analogy), and we set the final "driver ok" bit so the Host sees it's all
- * green. */
-static int lguest_dev_probe(struct device *_dev)
-{
- int ret;
- struct lguest_device*dev = container_of(_dev,struct lguest_device,dev);
- struct lguest_driver*drv = container_of(dev->dev.driver,
- struct lguest_driver, drv);
-
- lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER;
- ret = drv->probe(dev);
- if (ret == 0)
- lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER_OK;
- return ret;
-}
-
-/* The last part of the bus infrastructure is the function lguest drivers use
- * to register themselves. Firstly, we do nothing if there's no lguest bus
- * (ie. this is not a Guest), otherwise we fill in the embedded generic "struct
- * driver" fields and call the generic driver_register(). */
-int register_lguest_driver(struct lguest_driver *drv)
-{
- if (!lguest_devices)
- return 0;
-
- drv->drv.bus = &lguest_bus.bus;
- drv->drv.name = drv->name;
- drv->drv.owner = drv->owner;
- drv->drv.probe = lguest_dev_probe;
-
- return driver_register(&drv->drv);
-}
-
-/* At the moment we build all the drivers into the kernel because they're so
- * simple: 8144 bytes for all three of them as I type this. And as the console
- * really needs to be built in, it's actually only 3527 bytes for the network
- * and block drivers.
- *
- * If they get complex it will make sense for them to be modularized, so we
- * need to explicitly export the symbol.
- *
- * I don't think non-GPL modules make sense, so it's a GPL-only export.
- */
-EXPORT_SYMBOL_GPL(register_lguest_driver);
-
-/*D:120 This is the core of the lguest bus: actually adding a new device.
- * It's a separate function because it's neater that way, and because an
- * earlier version of the code supported hotplug and unplug. They were removed
- * early on because they were never used.
- *
- * As Andrew Tridgell says, "Untested code is buggy code".
- *
- * It's worth reading this carefully: we start with an index into the array of
- * "struct lguest_device_desc"s indicating the device which is new: */
-static void add_lguest_device(unsigned int index)
-{
- struct lguest_device *new;
-
- /* Each "struct lguest_device_desc" has a "status" field, which the
- * Guest updates as the device is probed. In the worst case, the Host
- * can look at these bits to tell what part of device setup failed,
- * even if the console isn't available. */
- lguest_devices[index].status |= LGUEST_DEVICE_S_ACKNOWLEDGE;
- new = kmalloc(sizeof(struct lguest_device), GFP_KERNEL);
- if (!new) {
- printk(KERN_EMERG "Cannot allocate lguest device %u\n", index);
- lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED;
- return;
- }
-
- /* The "struct lguest_device" setup is pretty straight-forward example
- * code. */
- new->index = index;
- new->private = NULL;
- memset(&new->dev, 0, sizeof(new->dev));
- new->dev.parent = &lguest_bus.dev;
- new->dev.bus = &lguest_bus.bus;
- sprintf(new->dev.bus_id, "%u", index);
-
- /* device_register() causes the bus infrastructure to look for a
- * matching driver. */
- if (device_register(&new->dev) != 0) {
- printk(KERN_EMERG "Cannot register lguest device %u\n", index);
- lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED;
- kfree(new);
- }
-}
-
-/*D:110 scan_devices() simply iterates through the device array. The type 0
- * is reserved to mean "no device", and anything else means we have found a
- * device: add it. */
-static void scan_devices(void)
-{
- unsigned int i;
-
- for (i = 0; i < LGUEST_MAX_DEVICES; i++)
- if (lguest_devices[i].type)
- add_lguest_device(i);
-}
-
-/*D:100 Fairly early in boot, lguest_bus_init() is called to set up the lguest
- * bus. We check that we are a Guest by checking pv_info.name: there are
- * other ways of checking, but this seems most obvious to me.
- *
- * So we can access the array of "struct lguest_device_desc"s easily, we map
- * that memory and store the pointer in the global "lguest_devices". Then we
- * register the bus with the core. Doing two registrations seems clunky to me,
- * but it seems to be the correct sysfs incantation.
- *
- * Finally we call scan_devices() which adds all the devices found in the
- * "struct lguest_device_desc" array. */
-static int __init lguest_bus_init(void)
-{
- if (strcmp(pv_info.name, "lguest") != 0)
- return 0;
-
- /* Devices are in a single page above top of "normal" mem */
- lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1);
-
- if (bus_register(&lguest_bus.bus) != 0
- || device_register(&lguest_bus.dev) != 0)
- panic("lguest bus registration failed");
-
- scan_devices();
- return 0;
-}
-/* Do this after core stuff, before devices. */
-postcore_initcall(lguest_bus_init);
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
new file mode 100644
index 00000000000..71c64837b43
--- /dev/null
+++ b/drivers/lguest/lguest_device.c
@@ -0,0 +1,373 @@
+/*P:050 Lguest guests use a very simple method to describe devices. It's a
+ * series of device descriptors contained just above the top of normal
+ * memory.
+ *
+ * We use the standard "virtio" device infrastructure, which provides us with a
+ * console, a network and a block driver. Each one expects some configuration
+ * information and a "virtqueue" mechanism to send and receive data. :*/
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/lguest_launcher.h>
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/interrupt.h>
+#include <linux/virtio_ring.h>
+#include <linux/err.h>
+#include <asm/io.h>
+#include <asm/paravirt.h>
+#include <asm/lguest_hcall.h>
+
+/* The pointer to our (page) of device descriptions. */
+static void *lguest_devices;
+
+/* Unique numbering for lguest devices. */
+static unsigned int dev_index;
+
+/* For Guests, device memory can be used as normal memory, so we cast away the
+ * __iomem to quieten sparse. */
+static inline void *lguest_map(unsigned long phys_addr, unsigned long pages)
+{
+ return (__force void *)ioremap(phys_addr, PAGE_SIZE*pages);
+}
+
+static inline void lguest_unmap(void *addr)
+{
+ iounmap((__force void __iomem *)addr);
+}
+
+/*D:100 Each lguest device is just a virtio device plus a pointer to its entry
+ * in the lguest_devices page. */
+struct lguest_device {
+ struct virtio_device vdev;
+
+ /* The entry in the lguest_devices page for this device. */
+ struct lguest_device_desc *desc;
+};
+
+/* Since the virtio infrastructure hands us a pointer to the virtio_device all
+ * the time, it helps to have a curt macro to get a pointer to the struct
+ * lguest_device it's enclosed in. */
+#define to_lgdev(vdev) container_of(vdev, struct lguest_device, vdev)
+
+/*D:130
+ * Device configurations
+ *
+ * The configuration information for a device consists of a series of fields.
+ * The device will look for these fields during setup.
+ *
+ * For us these fields come immediately after that device's descriptor in the
+ * lguest_devices page.
+ *
+ * Each field starts with a "type" byte, a "length" byte, then that number of
+ * bytes of configuration information. The device descriptor tells us the
+ * total configuration length so we know when we've reached the last field. */
+
+/* type + length bytes */
+#define FHDR_LEN 2
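As a concrete (made-up) example of the layout: a device carrying one 8-bit
field of type 2 holding the value 0x42 would have config_len = 3 and the bytes
{ 0x02, 0x01, 0x42 } sitting after its descriptor; lg_find() below walks
exactly this structure, two header bytes then "length" bytes of payload, until
it finds the type it wants.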
+
+/* This finds the first field of a given type for a device's configuration. */
+static void *lg_find(struct virtio_device *vdev, u8 type, unsigned int *len)
+{
+ struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
+ int i;
+
+ for (i = 0; i < desc->config_len; i += FHDR_LEN + desc->config[i+1]) {
+ if (desc->config[i] == type) {
+ /* Mark it used, so Host can know we looked at it, and
+ * also so we won't find the same one twice. */
+ desc->config[i] |= 0x80;
+ /* Remember, the second byte is the length. */
+ *len = desc->config[i+1];
+ /* We return a pointer to the field header. */
+ return desc->config + i;
+ }
+ }
+
+ /* Not found: return NULL for failure. */
+ return NULL;
+}
+
+/* Once they've found a field, getting a copy of it is easy. */
+static void lg_get(struct virtio_device *vdev, void *token,
+ void *buf, unsigned len)
+{
+ /* Check they didn't ask for more than the length of the field! */
+ BUG_ON(len > ((u8 *)token)[1]);
+ memcpy(buf, token + FHDR_LEN, len);
+}
+
+/* Setting the contents is also trivial. */
+static void lg_set(struct virtio_device *vdev, void *token,
+ const void *buf, unsigned len)
+{
+ BUG_ON(len > ((u8 *)token)[1]);
+ memcpy(token + FHDR_LEN, buf, len);
+}
+
+/* The operations to get and set the status word just access the status field
+ * of the device descriptor. */
+static u8 lg_get_status(struct virtio_device *vdev)
+{
+ return to_lgdev(vdev)->desc->status;
+}
+
+static void lg_set_status(struct virtio_device *vdev, u8 status)
+{
+ to_lgdev(vdev)->desc->status = status;
+}
+
+/*
+ * Virtqueues
+ *
+ * The other piece of infrastructure virtio needs is a "virtqueue": a way of
+ * the Guest device registering buffers for the other side to read from or
+ * write into (ie. send and receive buffers). Each device can have multiple
+ * virtqueues: for example the console has one queue for sending and one for
+ * receiving.
+ *
+ * Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue
+ * already exists in virtio_ring.c. We just need to connect it up.
+ *
+ * We start with the information we need to keep about each virtqueue.
+ */
+
+/*D:140 This is the information we remember about each virtqueue. */
+struct lguest_vq_info
+{
+ /* A copy of the information contained in the device config. */
+ struct lguest_vqconfig config;
+
+ /* The address where we mapped the virtio ring, so we can unmap it. */
+ void *pages;
+};
+
+/* When the virtio_ring code wants to prod the Host, it calls us here and we
+ * make a hypercall. We hand the page number of the virtqueue so the Host
+ * knows which virtqueue we're talking about. */
+static void lg_notify(struct virtqueue *vq)
+{
+ /* We store our virtqueue information in the "priv" pointer of the
+ * virtqueue structure. */
+ struct lguest_vq_info *lvq = vq->priv;
+
+ hcall(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT, 0, 0);
+}
+
+/* This routine finds the first virtqueue described in the configuration of
+ * this device and sets it up.
+ *
+ * This is kind of an ugly duckling. It'd be nicer to have a standard
+ * representation of a virtqueue in the configuration space, but it seems that
+ * everyone wants to do it differently. The KVM guys want the Guest to
+ * allocate its own pages and tell the Host where they are, but for lguest it's
+ * simpler for the Host to simply tell us where the pages are.
+ *
+ * So we provide devices with a "find virtqueue and set it up" function. */
+static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
+ bool (*callback)(struct virtqueue *vq))
+{
+ struct lguest_vq_info *lvq;
+ struct virtqueue *vq;
+ unsigned int len;
+ void *token;
+ int err;
+
+ /* Look for a field of the correct type to mark a virtqueue. Note that
+ * if this succeeds, then the type will be changed so it won't be found
+ * again, and future lg_find_vq() calls will find the next
+ * virtqueue (if any). */
+ token = vdev->config->find(vdev, VIRTIO_CONFIG_F_VIRTQUEUE, &len);
+ if (!token)
+ return ERR_PTR(-ENOENT);
+
+ lvq = kmalloc(sizeof(*lvq), GFP_KERNEL);
+ if (!lvq)
+ return ERR_PTR(-ENOMEM);
+
+ /* Note: we could use a configuration space inside here, just like we
+ * do for the device. This would allow expansion in future, because
+ * our configuration system is designed to be expansible. But this is
+ * way easier. */
+ if (len != sizeof(lvq->config)) {
+ dev_err(&vdev->dev, "Unexpected virtio config len %u\n", len);
+ err = -EIO;
+ goto free_lvq;
+ }
+ /* Make a copy of the "struct lguest_vqconfig" field. We need a copy
+ * because the config space might not be aligned correctly. */
+ vdev->config->get(vdev, token, &lvq->config, sizeof(lvq->config));
+
+ /* Figure out how many pages the ring will take, and map that memory */
+ lvq->pages = lguest_map((unsigned long)lvq->config.pfn << PAGE_SHIFT,
+ DIV_ROUND_UP(vring_size(lvq->config.num),
+ PAGE_SIZE));
+ if (!lvq->pages) {
+ err = -ENOMEM;
+ goto free_lvq;
+ }
+
+ /* OK, tell virtio_ring.c to set up a virtqueue now we know its size
+ * and we've got a pointer to its pages. */
+ vq = vring_new_virtqueue(lvq->config.num, vdev, lvq->pages,
+ lg_notify, callback);
+ if (!vq) {
+ err = -ENOMEM;
+ goto unmap;
+ }
+
+ /* Tell the interrupt for this virtqueue to go to the virtio_ring
+ * interrupt handler. */
+ /* FIXME: We used to have a flag for the Host to tell us we could use
+ * the interrupt as a source of randomness: it'd be nice to have that
+ * back.. */
+ err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED,
+ vdev->dev.bus_id, vq);
+ if (err)
+ goto destroy_vring;
+
+ /* Last of all we hook up our "struct lguest_vq_info" to the
+ * virtqueue's priv pointer. */
+ vq->priv = lvq;
+ return vq;
+
+destroy_vring:
+ vring_del_virtqueue(vq);
+unmap:
+ lguest_unmap(lvq->pages);
+free_lvq:
+ kfree(lvq);
+ return ERR_PTR(err);
+}
+/*:*/
+
+/* Cleaning up a virtqueue is easy */
+static void lg_del_vq(struct virtqueue *vq)
+{
+ struct lguest_vq_info *lvq = vq->priv;
+
+ /* Tell virtio_ring.c to free the virtqueue. */
+ vring_del_virtqueue(vq);
+ /* Unmap the pages containing the ring. */
+ lguest_unmap(lvq->pages);
+ /* Free our own queue information. */
+ kfree(lvq);
+}
+
+/* The ops structure which hooks everything together. */
+static struct virtio_config_ops lguest_config_ops = {
+ .find = lg_find,
+ .get = lg_get,
+ .set = lg_set,
+ .get_status = lg_get_status,
+ .set_status = lg_set_status,
+ .find_vq = lg_find_vq,
+ .del_vq = lg_del_vq,
+};
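To see these ops from the consumer's side, here is a minimal sketch of a
virtio driver probe routine (all the "example_" names are invented for
illustration):

    /* Callback run from vring_interrupt() when the Host has used our
     * buffers. */
    static bool example_recv_done(struct virtqueue *vq)
    {
            return true;
    }

    static int example_probe(struct virtio_device *vdev)
    {
            struct virtqueue *vq;

            /* For an lguest device, this ends up in lg_find_vq() above. */
            vq = vdev->config->find_vq(vdev, example_recv_done);
            if (IS_ERR(vq))
                    return PTR_ERR(vq);

            /* A real driver would now register buffers with
             * vq->vq_ops->add_buf() and prod the Host with
             * vq->vq_ops->kick(). */
            vdev->priv = vq;
            return 0;
    }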
+
+/* The root device for the lguest virtio devices. This makes them appear as
+ * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. */
+static struct device lguest_root = {
+ .parent = NULL,
+ .bus_id = "lguest",
+};
+
+/*D:120 This is the core of the lguest bus: actually adding a new device.
+ * It's a separate function because it's neater that way, and because an
+ * earlier version of the code supported hotplug and unplug. They were removed
+ * early on because they were never used.
+ *
+ * As Andrew Tridgell says, "Untested code is buggy code".
+ *
+ * It's worth reading this carefully: we start with a pointer to the new device
+ * descriptor in the "lguest_devices" page. */
+static void add_lguest_device(struct lguest_device_desc *d)
+{
+ struct lguest_device *ldev;
+
+ ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
+ if (!ldev) {
+ printk(KERN_EMERG "Cannot allocate lguest dev %u\n",
+ dev_index++);
+ return;
+ }
+
+ /* This device's parent is the lguest/ dir. */
+ ldev->vdev.dev.parent = &lguest_root;
+ /* We have a unique device index thanks to the dev_index counter. */
+ ldev->vdev.index = dev_index++;
+ /* The device type comes straight from the descriptor. There's also a
+ * device vendor field in the virtio_device struct, which we leave as
+ * 0. */
+ ldev->vdev.id.device = d->type;
+ /* We have a simple set of routines for querying the device's
+ * configuration information and setting its status. */
+ ldev->vdev.config = &lguest_config_ops;
+ /* And we remember the device's descriptor for lguest_config_ops. */
+ ldev->desc = d;
+
+ /* register_virtio_device() sets up the generic fields for the struct
+ * virtio_device and calls device_register(). This makes the bus
+ * infrastructure look for a matching driver. */
+ if (register_virtio_device(&ldev->vdev) != 0) {
+ printk(KERN_ERR "Failed to register lguest device %u\n",
+ ldev->vdev.index);
+ kfree(ldev);
+ }
+}
+
+/*D:110 scan_devices() simply iterates through the device page. The type 0 is
+ * reserved to mean "end of devices". */
+static void scan_devices(void)
+{
+ unsigned int i;
+ struct lguest_device_desc *d;
+
+ /* We start at the page beginning, and skip over each entry. */
+ for (i = 0; i < PAGE_SIZE; i += sizeof(*d) + d->config_len) {
+ d = lguest_devices + i;
+
+ /* Once we hit a zero, stop. */
+ if (d->type == 0)
+ break;
+
+ add_lguest_device(d);
+ }
+}
+
+/*D:105 Fairly early in boot, lguest_devices_init() is called to set up the
+ * lguest device infrastructure. We check that we are a Guest by checking
+ * pv_info.name: there are other ways of checking, but this seems most
+ * obvious to me.
+ *
+ * So we can access the "struct lguest_device_desc"s easily, we map that memory
+ * and store the pointer in the global "lguest_devices". Then we register a
+ * root device from which all our devices will hang (this seems to be the
+ * correct sysfs incantation).
+ *
+ * Finally we call scan_devices() which adds all the devices found in the
+ * lguest_devices page. */
+static int __init lguest_devices_init(void)
+{
+ if (strcmp(pv_info.name, "lguest") != 0)
+ return 0;
+
+ if (device_register(&lguest_root) != 0)
+ panic("Could not register lguest root");
+
+ /* Devices are in a single page above top of "normal" mem */
+ lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1);
+
+ scan_devices();
+ return 0;
+}
+/* We do this after core stuff, but before the drivers. */
+postcore_initcall(lguest_devices_init);
+
+/*D:150 At this point in the journey we used to wade through the lguest
+ * devices themselves: net, block and console. Since they're all now virtio
+ * devices rather than lguest-specific, I've decided to ignore them. Mostly,
+ * they're kind of boring. But this does mean you'll never experience the
+ * thrill of reading the forbidden love scene buried deep in the block driver.
+ *
+ * "make Launcher" beckons, where we answer questions like "Where do Guests
+ * come from?", and "What do you do when someone asks for optimization?". */
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 80d1b58c769..ee405b38383 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -1,73 +1,17 @@
/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher
* controls and communicates with the Guest. For example, the first write will
- * tell us the memory size, pagetable, entry point and kernel address offset.
- * A read will run the Guest until a signal is pending (-EINTR), or the Guest
- * does a DMA out to the Launcher. Writes are also used to get a DMA buffer
- * registered by the Guest and to send the Guest an interrupt. :*/
+ * tell us the Guest's memory layout, pagetable, entry point and kernel address
+ * offset. A read will run the Guest until something happens, such as a signal
+ * or the Guest doing a NOTIFY out to the Launcher. :*/
#include <linux/uaccess.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include "lg.h"
-/*L:030 setup_regs() doesn't really belong in this file, but it gives us an
- * early glimpse deeper into the Host so it's worth having here.
- *
- * Most of the Guest's registers are left alone: we used get_zeroed_page() to
- * allocate the structure, so they will be 0. */
-static void setup_regs(struct lguest_regs *regs, unsigned long start)
-{
- /* There are four "segment" registers which the Guest needs to boot:
- * The "code segment" register (cs) refers to the kernel code segment
- * __KERNEL_CS, and the "data", "extra" and "stack" segment registers
- * refer to the kernel data segment __KERNEL_DS.
- *
- * The privilege level is packed into the lower bits. The Guest runs
- * at privilege level 1 (GUEST_PL).*/
- regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL;
- regs->cs = __KERNEL_CS|GUEST_PL;
-
- /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002)
- * is supposed to always be "1". Bit 9 (0x200) controls whether
- * interrupts are enabled. We always leave interrupts enabled while
- * running the Guest. */
- regs->eflags = 0x202;
-
- /* The "Extended Instruction Pointer" register says where the Guest is
- * running. */
- regs->eip = start;
-
- /* %esi points to our boot information, at physical address 0, so don't
- * touch it. */
-}
-
-/*L:310 To send DMA into the Guest, the Launcher needs to be able to ask for a
- * DMA buffer. This is done by writing LHREQ_GETDMA and the key to
- * /dev/lguest. */
-static long user_get_dma(struct lguest *lg, const u32 __user *input)
-{
- unsigned long key, udma, irq;
-
- /* Fetch the key they wrote to us. */
- if (get_user(key, input) != 0)
- return -EFAULT;
- /* Look for a free Guest DMA buffer bound to that key. */
- udma = get_dma_buffer(lg, key, &irq);
- if (!udma)
- return -ENOENT;
-
- /* We need to tell the Launcher what interrupt the Guest expects after
- * the buffer is filled. We stash it in udma->used_len. */
- lgwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq);
-
- /* The (guest-physical) address of the DMA buffer is returned from
- * the write(). */
- return udma;
-}
-
/*L:315 To force the Guest to stop running and return to the Launcher, the
 * Waker writes LHREQ_BREAK and the value "1" to /dev/lguest.  The
* Launcher then writes LHREQ_BREAK and "0" to release the Waker. */
-static int break_guest_out(struct lguest *lg, const u32 __user *input)
+static int break_guest_out(struct lguest *lg, const unsigned long __user *input)
{
unsigned long on;
@@ -90,9 +34,9 @@ static int break_guest_out(struct lguest *lg, const u32 __user *input)
/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
* number to /dev/lguest. */
-static int user_send_irq(struct lguest *lg, const u32 __user *input)
+static int user_send_irq(struct lguest *lg, const unsigned long __user *input)
{
- u32 irq;
+ unsigned long irq;
if (get_user(irq, input) != 0)
return -EFAULT;
@@ -133,17 +77,19 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
return len;
}
- /* If we returned from read() last time because the Guest sent DMA,
+ /* If we returned from read() last time because the Guest notified,
* clear the flag. */
- if (lg->dma_is_pending)
- lg->dma_is_pending = 0;
+ if (lg->pending_notify)
+ lg->pending_notify = 0;
/* Run the Guest until something interesting happens. */
return run_guest(lg, (unsigned long __user *)user);
}
-/*L:020 The initialization write supplies 4 32-bit values (in addition to the
- * 32-bit LHREQ_INITIALIZE value). These are:
+/*L:020 The initialization write supplies 4 pointer-sized (32- or 64-bit)
+ * values (in addition to the LHREQ_INITIALIZE value). These are:
+ *
+ * base: The start of the Guest-physical memory inside the Launcher memory.
*
* pfnlimit: The highest (Guest-physical) page number the Guest should be
* allowed to access. The Launcher has to live in Guest memory, so it sets
@@ -153,23 +99,17 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
* pagetables (which are set up by the Launcher).
*
* start: The first instruction to execute ("eip" in x86-speak).
- *
- * page_offset: The PAGE_OFFSET constant in the Guest kernel. We should
- * probably wean the code off this, but it's a very useful constant! Any
- * address above this is within the Guest kernel, and any kernel address can
- * quickly converted from physical to virtual by adding PAGE_OFFSET. It's
- * 0xC0000000 (3G) by default, but it's configurable at kernel build time.
*/
-static int initialize(struct file *file, const u32 __user *input)
+static int initialize(struct file *file, const unsigned long __user *input)
{
/* "struct lguest" contains everything we (the Host) know about a
* Guest. */
struct lguest *lg;
- int err, i;
- u32 args[4];
+ int err;
+ unsigned long args[4];
- /* We grab the Big Lguest lock, which protects the global array
- * "lguests" and multiple simultaneous initializations. */
+ /* We grab the Big Lguest lock, which protects against multiple
+ * simultaneous initializations. */
mutex_lock(&lguest_lock);
/* You can't initialize twice! Close the device and start again... */
if (file->private_data) {
@@ -182,20 +122,15 @@ static int initialize(struct file *file, const u32 __user *input)
goto unlock;
}
- /* Find an unused guest. */
- i = find_free_guest();
- if (i < 0) {
- err = -ENOSPC;
+ lg = kzalloc(sizeof(*lg), GFP_KERNEL);
+ if (!lg) {
+ err = -ENOMEM;
goto unlock;
}
- /* OK, we have an index into the "lguest" array: "lg" is a convenient
- * pointer. */
- lg = &lguests[i];
/* Populate the easy fields of our "struct lguest" */
- lg->guestid = i;
- lg->pfn_limit = args[0];
- lg->page_offset = args[3];
+ lg->mem_base = (void __user *)(long)args[0];
+ lg->pfn_limit = args[1];
/* We need a complete page for the Guest registers: they are accessible
* to the Guest and we can only grant it access to whole pages. */
@@ -210,17 +145,13 @@ static int initialize(struct file *file, const u32 __user *input)
/* Initialize the Guest's shadow page tables, using the toplevel
* address the Launcher gave us. This allocates memory, so can
* fail. */
- err = init_guest_pagetable(lg, args[1]);
+ err = init_guest_pagetable(lg, args[2]);
if (err)
goto free_regs;
/* Now we initialize the Guest's registers, handing it the start
* address. */
- setup_regs(lg->regs, args[2]);
-
- /* There are a couple of GDT entries the Guest expects when first
- * booting. */
- setup_guest_gdt(lg);
+ lguest_arch_setup_regs(lg, args[3]);
/* The timer for lguest's clock needs initialization. */
init_clockdev(lg);
@@ -260,18 +191,19 @@ unlock:
/*L:010 The first operation the Launcher does must be a write. All writes
 * start with an unsigned long: for the first write this must be
* LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use
- * writes of other values to get DMA buffers and send interrupts. */
-static ssize_t write(struct file *file, const char __user *input,
+ * writes of other values to send interrupts. */
+static ssize_t write(struct file *file, const char __user *in,
size_t size, loff_t *off)
{
/* Once the guest is initialized, we hold the "struct lguest" in the
* file private data. */
struct lguest *lg = file->private_data;
- u32 req;
+ const unsigned long __user *input = (const unsigned long __user *)in;
+ unsigned long req;
if (get_user(req, input) != 0)
return -EFAULT;
- input += sizeof(req);
+ input++;
/* If you haven't initialized, you must do that first. */
if (req != LHREQ_INITIALIZE && !lg)
@@ -287,13 +219,11 @@ static ssize_t write(struct file *file, const char __user *input,
switch (req) {
case LHREQ_INITIALIZE:
- return initialize(file, (const u32 __user *)input);
- case LHREQ_GETDMA:
- return user_get_dma(lg, (const u32 __user *)input);
+ return initialize(file, input);
case LHREQ_IRQ:
- return user_send_irq(lg, (const u32 __user *)input);
+ return user_send_irq(lg, input);
case LHREQ_BREAK:
- return break_guest_out(lg, (const u32 __user *)input);
+ return break_guest_out(lg, input);
default:
return -EINVAL;
}
@@ -319,8 +249,6 @@ static int close(struct inode *inode, struct file *file)
mutex_lock(&lguest_lock);
/* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
hrtimer_cancel(&lg->hrt);
- /* Free any DMA buffers the Guest had bound. */
- release_all_dma(lg);
/* Free up the shadow page tables for the Guest. */
free_guest_pagetable(lg);
/* Now all the memory cleanups are done, it's safe to release the
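
Stepping back, the whole /dev/lguest protocol from the Launcher's side is: one LHREQ_INITIALIZE write, then a read() loop that runs the Guest, with LHREQ_IRQ writes to inject interrupts. A hedged sketch of that skeleton (request constants come from linux/lguest_launcher.h; mapping the Guest's memory and loading its kernel image, which a real Launcher must do first, are elided):

    #include <fcntl.h>
    #include <unistd.h>
    #include <linux/lguest_launcher.h>      /* LHREQ_INITIALIZE, LHREQ_IRQ */

    static int launch(void *base, unsigned long pfnlimit,
                      unsigned long pgdir, unsigned long start)
    {
            /* The first write: LHREQ_INITIALIZE plus the four
             * pointer-sized values described in initialize() above. */
            unsigned long args[] = { LHREQ_INITIALIZE, (unsigned long)base,
                                     pfnlimit, pgdir, start };
            int fd = open("/dev/lguest", O_RDWR);

            if (fd < 0 || write(fd, args, sizeof(args)) < 0)
                    return -1;

            /* Each read() runs the Guest until a signal arrives or the
             * Guest does a NOTIFY; the notify value lands in the buffer
             * and a real Launcher would service it here. */
            for (;;) {
                    unsigned long notify_addr;

                    if (read(fd, &notify_addr, sizeof(notify_addr)) < 0)
                            return -1;
            }
    }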
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index b7a924ace68..2a45f0691c9 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -13,6 +13,7 @@
#include <linux/random.h>
#include <linux/percpu.h>
#include <asm/tlbflush.h>
+#include <asm/uaccess.h>
#include "lg.h"
/*M:008 We hold reference to pages, which prevents them from being swapped.
@@ -44,44 +45,32 @@
* (vii) Setting up the page tables initially.
:*/
-/* Pages a 4k long, and each page table entry is 4 bytes long, giving us 1024
- * (or 2^10) entries per page. */
-#define PTES_PER_PAGE_SHIFT 10
-#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT)
/* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is
* conveniently placed at the top 4MB, so it uses a separate, complete PTE
* page. */
-#define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1)
+#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
/* We actually need a separate PTE page for each CPU. Remember that after the
* Switcher code itself comes two pages for each CPU, and we don't want this
* CPU's guest to see the pages of any other CPU. */
-static DEFINE_PER_CPU(spte_t *, switcher_pte_pages);
+static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
/*H:320 With our shadow and Guest types established, we need to deal with
* them: the page table code is curly enough to need helper functions to keep
* it clear and clean.
*
- * The first helper takes a virtual address, and says which entry in the top
- * level page table deals with that address. Since each top level entry deals
- * with 4M, this effectively divides by 4M. */
-static unsigned vaddr_to_pgd_index(unsigned long vaddr)
-{
- return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
-}
-
-/* There are two functions which return pointers to the shadow (aka "real")
+ * There are two functions which return pointers to the shadow (aka "real")
* page tables.
*
* spgd_addr() takes the virtual address and returns a pointer to the top-level
* page directory entry for that address. Since we keep track of several page
* tables, the "i" argument tells us which one we're interested in (it's
* usually the current one). */
-static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
+static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
{
- unsigned int index = vaddr_to_pgd_index(vaddr);
+ unsigned int index = pgd_index(vaddr);
/* We kill any Guest trying to touch the Switcher addresses. */
if (index >= SWITCHER_PGD_INDEX) {
@@ -95,28 +84,28 @@ static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
/* This routine then takes the PGD entry given above, which contains the
* address of the PTE page. It then returns a pointer to the PTE entry for the
* given address. */
-static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr)
+static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr)
{
- spte_t *page = __va(spgd.pfn << PAGE_SHIFT);
+ pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
/* You should never call this if the PGD entry wasn't valid */
- BUG_ON(!(spgd.flags & _PAGE_PRESENT));
- return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE];
+ BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
+ return &page[(vaddr >> PAGE_SHIFT) % PTRS_PER_PTE];
}
/* These two functions just like the above two, except they access the Guest
* page tables. Hence they return a Guest address. */
static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr)
{
- unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
- return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(gpgd_t);
+ unsigned int index = vaddr >> (PGDIR_SHIFT);
+ return lg->pgdirs[lg->pgdidx].gpgdir + index * sizeof(pgd_t);
}
static unsigned long gpte_addr(struct lguest *lg,
- gpgd_t gpgd, unsigned long vaddr)
+ pgd_t gpgd, unsigned long vaddr)
{
- unsigned long gpage = gpgd.pfn << PAGE_SHIFT;
- BUG_ON(!(gpgd.flags & _PAGE_PRESENT));
- return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t);
+ unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
+ BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
+ return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t);
}
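
All four helpers are doing the standard i386 two-level split of a 32-bit virtual address: the top 10 bits index the page directory, the next 10 the page table, and the low 12 are the byte offset. A worked example of the arithmetic, in plain C so it can be checked independently of the kernel macros:

    #include <stdio.h>

    int main(void)
    {
            unsigned long vaddr = 0xC0001234UL; /* a typical kernel address */
            unsigned long pgd = vaddr >> 22;           /* top 10 bits */
            unsigned long pte = (vaddr >> 12) & 1023;  /* next 10 bits */
            unsigned long off = vaddr & 0xFFF;         /* low 12 bits */

            /* Prints: pgd 768, pte 1, offset 0x234 */
            printf("pgd %lu, pte %lu, offset %#lx\n", pgd, pte, off);
            return 0;
    }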
/*H:350 This routine takes a page number given by the Guest and converts it to
@@ -149,53 +138,55 @@ static unsigned long get_pfn(unsigned long virtpfn, int write)
* entry can be a little tricky. The flags are (almost) the same, but the
* Guest PTE contains a virtual page number: the CPU needs the real page
* number. */
-static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write)
+static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write)
{
- spte_t spte;
- unsigned long pfn;
+ unsigned long pfn, base, flags;
/* The Guest sets the global flag, because it thinks that it is using
* PGE. We only told it to use PGE so it would tell us whether it was
* flushing a kernel mapping or a userspace mapping. We don't actually
* use the global bit, so throw it away. */
- spte.flags = (gpte.flags & ~_PAGE_GLOBAL);
+ flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);
+
+ /* The Guest's pages are offset inside the Launcher. */
+ base = (unsigned long)lg->mem_base / PAGE_SIZE;
/* We need a temporary "unsigned long" variable to hold the answer from
* get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
* fit in spte.pfn. get_pfn() finds the real physical number of the
* page, given the virtual number. */
- pfn = get_pfn(gpte.pfn, write);
+ pfn = get_pfn(base + pte_pfn(gpte), write);
if (pfn == -1UL) {
- kill_guest(lg, "failed to get page %u", gpte.pfn);
+ kill_guest(lg, "failed to get page %lu", pte_pfn(gpte));
/* When we destroy the Guest, we'll go through the shadow page
* tables and release_pte() them. Make sure we don't think
* this one is valid! */
- spte.flags = 0;
+ flags = 0;
}
- /* Now we assign the page number, and our shadow PTE is complete. */
- spte.pfn = pfn;
- return spte;
+ /* Now we assemble our shadow PTE from the page number and flags. */
+ return pfn_pte(pfn, __pgprot(flags));
}
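
The one subtlety in gpte_to_spte() is the new "base" term: Guest-physical page N now lives at Launcher virtual address mem_base + N*PAGE_SIZE, so the page get_pfn() must look up and pin is mem_base/PAGE_SIZE + N. A tiny worked example with made-up numbers:

    #include <stdio.h>

    int main(void)
    {
            /* Say the Launcher mapped Guest memory at 0x8000000 (128MB)
             * and the Guest references its physical page 5 (4k pages). */
            unsigned long mem_base = 0x8000000UL;
            unsigned long guest_pfn = 5;
            unsigned long base = mem_base / 4096;       /* 0x8000 */

            /* get_pfn() pins this Launcher-virtual page; the real
             * physical page behind it is what fills the shadow PTE. */
            printf("launcher pfn %#lx\n", base + guest_pfn); /* 0x8005 */
            return 0;
    }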
/*H:460 And to complete the chain, release_pte() looks like this: */
-static void release_pte(spte_t pte)
+static void release_pte(pte_t pte)
{
/* Remember that get_user_pages() took a reference to the page, in
* get_pfn()? We have to put it back now. */
- if (pte.flags & _PAGE_PRESENT)
- put_page(pfn_to_page(pte.pfn));
+ if (pte_flags(pte) & _PAGE_PRESENT)
+ put_page(pfn_to_page(pte_pfn(pte)));
}
/*:*/
-static void check_gpte(struct lguest *lg, gpte_t gpte)
+static void check_gpte(struct lguest *lg, pte_t gpte)
{
- if ((gpte.flags & (_PAGE_PWT|_PAGE_PSE)) || gpte.pfn >= lg->pfn_limit)
+ if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE))
+ || pte_pfn(gpte) >= lg->pfn_limit)
kill_guest(lg, "bad page table entry");
}
-static void check_gpgd(struct lguest *lg, gpgd_t gpgd)
+static void check_gpgd(struct lguest *lg, pgd_t gpgd)
{
- if ((gpgd.flags & ~_PAGE_TABLE) || gpgd.pfn >= lg->pfn_limit)
+ if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || pgd_pfn(gpgd) >= lg->pfn_limit)
kill_guest(lg, "bad page directory entry");
}
@@ -211,21 +202,21 @@ static void check_gpgd(struct lguest *lg, gpgd_t gpgd)
* true. */
int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
{
- gpgd_t gpgd;
- spgd_t *spgd;
+ pgd_t gpgd;
+ pgd_t *spgd;
unsigned long gpte_ptr;
- gpte_t gpte;
- spte_t *spte;
+ pte_t gpte;
+ pte_t *spte;
/* First step: get the top-level Guest page table entry. */
- gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr)));
+ gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t);
/* Toplevel not present? We can't map it in. */
- if (!(gpgd.flags & _PAGE_PRESENT))
+ if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
return 0;
/* Now look at the matching shadow entry. */
spgd = spgd_addr(lg, lg->pgdidx, vaddr);
- if (!(spgd->flags & _PAGE_PRESENT)) {
+ if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
/* No shadow entry: allocate a new shadow PTE page. */
unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
/* This is not really the Guest's fault, but killing it is
@@ -238,34 +229,35 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
check_gpgd(lg, gpgd);
/* And we copy the flags to the shadow PGD entry. The page
* number in the shadow PGD is the page we just allocated. */
- spgd->raw.val = (__pa(ptepage) | gpgd.flags);
+ *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd));
}
/* OK, now we look at the lower level in the Guest page table: keep its
* address, because we might update it later. */
gpte_ptr = gpte_addr(lg, gpgd, vaddr);
- gpte = mkgpte(lgread_u32(lg, gpte_ptr));
+ gpte = lgread(lg, gpte_ptr, pte_t);
/* If this page isn't in the Guest page tables, we can't page it in. */
- if (!(gpte.flags & _PAGE_PRESENT))
+ if (!(pte_flags(gpte) & _PAGE_PRESENT))
return 0;
/* Check they're not trying to write to a page the Guest wants
* read-only (bit 2 of errcode == write). */
- if ((errcode & 2) && !(gpte.flags & _PAGE_RW))
+ if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
return 0;
/* User access to a kernel page? (bit 3 == user access) */
- if ((errcode & 4) && !(gpte.flags & _PAGE_USER))
+ if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
return 0;
/* Check that the Guest PTE flags are OK, and the page number is below
* the pfn_limit (ie. not mapping the Launcher binary). */
check_gpte(lg, gpte);
/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
- gpte.flags |= _PAGE_ACCESSED;
+ gpte = pte_mkyoung(gpte);
+
if (errcode & 2)
- gpte.flags |= _PAGE_DIRTY;
+ gpte = pte_mkdirty(gpte);
/* Get the pointer to the shadow PTE entry we're going to set. */
spte = spte_addr(lg, *spgd, vaddr);
@@ -275,21 +267,18 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
/* If this is a write, we insist that the Guest page is writable (the
* final arg to gpte_to_spte()). */
- if (gpte.flags & _PAGE_DIRTY)
+ if (pte_dirty(gpte))
*spte = gpte_to_spte(lg, gpte, 1);
- else {
+ else
/* If this is a read, don't set the "writable" bit in the page
* table entry, even if the Guest says it's writable. That way
 * we come back here when a write does actually occur, so we can
* update the Guest's _PAGE_DIRTY flag. */
- gpte_t ro_gpte = gpte;
- ro_gpte.flags &= ~_PAGE_RW;
- *spte = gpte_to_spte(lg, ro_gpte, 0);
- }
+ *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0);
/* Finally, we write the Guest PTE entry back: we've set the
* _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
- lgwrite_u32(lg, gpte_ptr, gpte.raw.val);
+ lgwrite(lg, gpte_ptr, pte_t, gpte);
/* We succeeded in mapping the page! */
return 1;
@@ -305,17 +294,18 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
* mapped by the shadow page tables, and is it writable? */
static int page_writable(struct lguest *lg, unsigned long vaddr)
{
- spgd_t *spgd;
+ pgd_t *spgd;
unsigned long flags;
/* Look at the top level entry: is it present? */
spgd = spgd_addr(lg, lg->pgdidx, vaddr);
- if (!(spgd->flags & _PAGE_PRESENT))
+ if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
return 0;
/* Check the flags on the pte entry itself: it must be present and
* writable. */
- flags = spte_addr(lg, *spgd, vaddr)->flags;
+ flags = pte_flags(*(spte_addr(lg, *spgd, vaddr)));
+
return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
}
@@ -329,22 +319,22 @@ void pin_page(struct lguest *lg, unsigned long vaddr)
}
/*H:450 If we chase down the release_pgd() code, it looks like this: */
-static void release_pgd(struct lguest *lg, spgd_t *spgd)
+static void release_pgd(struct lguest *lg, pgd_t *spgd)
{
/* If the entry's not present, there's nothing to release. */
- if (spgd->flags & _PAGE_PRESENT) {
+ if (pgd_flags(*spgd) & _PAGE_PRESENT) {
unsigned int i;
/* Converting the pfn to find the actual PTE page is easy: turn
* the page number into a physical address, then convert to a
* virtual address (easy for kernel pages like this one). */
- spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT);
+ pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
/* For each entry in the page, we might need to release it. */
- for (i = 0; i < PTES_PER_PAGE; i++)
+ for (i = 0; i < PTRS_PER_PTE; i++)
release_pte(ptepage[i]);
/* Now we can free the page of PTEs */
free_page((long)ptepage);
 /* And zero out the PGD entry so we never release it twice. */
- spgd->raw.val = 0;
+ *spgd = __pgd(0);
}
}
@@ -356,7 +346,7 @@ static void flush_user_mappings(struct lguest *lg, int idx)
{
unsigned int i;
/* Release every pgd entry up to the kernel's address. */
- for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++)
+ for (i = 0; i < pgd_index(lg->kernel_address); i++)
release_pgd(lg, lg->pgdirs[idx].pgdir + i);
}
@@ -369,6 +359,25 @@ void guest_pagetable_flush_user(struct lguest *lg)
}
/*:*/
+/* We walk down the Guest's page tables to get a guest-physical address. */
+unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
+{
+ pgd_t gpgd;
+ pte_t gpte;
+
+ /* First step: get the top-level Guest page table entry. */
+ gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t);
+ /* Toplevel not present? The address is bad, so kill the Guest. */
+ if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
+ kill_guest(lg, "Bad address %#lx", vaddr);
+
+ gpte = lgread(lg, gpte_addr(lg, gpgd, vaddr), pte_t);
+ if (!(pte_flags(gpte) & _PAGE_PRESENT))
+ kill_guest(lg, "Bad address %#lx", vaddr);
+
+ return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
+}
+
/* We keep several page tables. This is a simple routine to find the page
* table (if any) corresponding to this top-level address the Guest has given
* us. */
@@ -376,7 +385,7 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
{
unsigned int i;
for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
- if (lg->pgdirs[i].cr3 == pgtable)
+ if (lg->pgdirs[i].gpgdir == pgtable)
break;
return i;
}
@@ -385,7 +394,7 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
* allocate a new one (and so the kernel parts are not there), we set
* blank_pgdir. */
static unsigned int new_pgdir(struct lguest *lg,
- unsigned long cr3,
+ unsigned long gpgdir,
int *blank_pgdir)
{
unsigned int next;
@@ -395,7 +404,7 @@ static unsigned int new_pgdir(struct lguest *lg,
next = random32() % ARRAY_SIZE(lg->pgdirs);
/* If it's never been allocated at all before, try now. */
if (!lg->pgdirs[next].pgdir) {
- lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL);
+ lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
/* If the allocation fails, just keep using the one we have */
if (!lg->pgdirs[next].pgdir)
next = lg->pgdidx;
@@ -405,7 +414,7 @@ static unsigned int new_pgdir(struct lguest *lg,
*blank_pgdir = 1;
}
/* Record which Guest toplevel this shadows. */
- lg->pgdirs[next].cr3 = cr3;
+ lg->pgdirs[next].gpgdir = gpgdir;
/* Release all the non-kernel mappings. */
flush_user_mappings(lg, next);
@@ -472,26 +481,27 @@ void guest_pagetable_clear_all(struct lguest *lg)
* they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
*/
static void do_set_pte(struct lguest *lg, int idx,
- unsigned long vaddr, gpte_t gpte)
+ unsigned long vaddr, pte_t gpte)
{
 /* Look up the matching shadow page directory entry. */
- spgd_t *spgd = spgd_addr(lg, idx, vaddr);
+ pgd_t *spgd = spgd_addr(lg, idx, vaddr);
/* If the top level isn't present, there's no entry to update. */
- if (spgd->flags & _PAGE_PRESENT) {
+ if (pgd_flags(*spgd) & _PAGE_PRESENT) {
/* Otherwise, we start by releasing the existing entry. */
- spte_t *spte = spte_addr(lg, *spgd, vaddr);
+ pte_t *spte = spte_addr(lg, *spgd, vaddr);
release_pte(*spte);
/* If they're setting this entry as dirty or accessed, we might
* as well put that entry they've given us in now. This shaves
* 10% off a copy-on-write micro-benchmark. */
- if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
+ if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
check_gpte(lg, gpte);
- *spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY);
+ *spte = gpte_to_spte(lg, gpte,
+ pte_flags(gpte) & _PAGE_DIRTY);
} else
/* Otherwise we can demand_page() it in later. */
- spte->raw.val = 0;
+ *spte = __pte(0);
}
}
@@ -506,18 +516,18 @@ static void do_set_pte(struct lguest *lg, int idx,
 * The benefit is that when we have to track a new page table, we can keep all
 * the kernel mappings.  This speeds up context switches immensely.
void guest_set_pte(struct lguest *lg,
- unsigned long cr3, unsigned long vaddr, gpte_t gpte)
+ unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
{
/* Kernel mappings must be changed on all top levels. Slow, but
* doesn't happen often. */
- if (vaddr >= lg->page_offset) {
+ if (vaddr >= lg->kernel_address) {
unsigned int i;
for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
if (lg->pgdirs[i].pgdir)
do_set_pte(lg, i, vaddr, gpte);
} else {
/* Is this page table one we have a shadow for? */
- int pgdir = find_pgdir(lg, cr3);
+ int pgdir = find_pgdir(lg, gpgdir);
if (pgdir != ARRAY_SIZE(lg->pgdirs))
/* If so, do the update. */
do_set_pte(lg, pgdir, vaddr, gpte);
@@ -538,7 +548,7 @@ void guest_set_pte(struct lguest *lg,
*
 * So with that in mind, here's our code to update a (top-level) PGD entry:
*/
-void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx)
+void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx)
{
int pgdir;
@@ -548,7 +558,7 @@ void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx)
return;
/* If they're talking about a page table we have a shadow for... */
- pgdir = find_pgdir(lg, cr3);
+ pgdir = find_pgdir(lg, gpgdir);
if (pgdir < ARRAY_SIZE(lg->pgdirs))
/* ... throw it away. */
release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
@@ -560,21 +570,34 @@ void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx)
* its first page table is. We set some things up here: */
int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
{
- /* In flush_user_mappings() we loop from 0 to
- * "vaddr_to_pgd_index(lg->page_offset)". This assumes it won't hit
- * the Switcher mappings, so check that now. */
- if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX)
- return -EINVAL;
/* We start on the first shadow page table, and give it a blank PGD
* page. */
lg->pgdidx = 0;
- lg->pgdirs[lg->pgdidx].cr3 = pgtable;
- lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL);
+ lg->pgdirs[lg->pgdidx].gpgdir = pgtable;
+ lg->pgdirs[lg->pgdidx].pgdir = (pgd_t*)get_zeroed_page(GFP_KERNEL);
if (!lg->pgdirs[lg->pgdidx].pgdir)
return -ENOMEM;
return 0;
}
+/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
+void page_table_guest_data_init(struct lguest *lg)
+{
+ /* We get the kernel address: above this is all kernel memory. */
+ if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address)
+ /* We tell the Guest that it can't use the top 4MB of virtual
+ * addresses used by the Switcher. */
+ || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
+ || put_user(lg->pgdirs[lg->pgdidx].gpgdir,&lg->lguest_data->pgdir))
+ kill_guest(lg, "bad guest page %p", lg->lguest_data);
+
+ /* In flush_user_mappings() we loop from 0 to
+ * "pgd_index(lg->kernel_address)". This assumes it won't hit the
+ * Switcher mappings, so check that now. */
+ if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX)
+ kill_guest(lg, "bad kernel address %#lx", lg->kernel_address);
+}
+
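The 4MB reserve_mem and the SWITCHER_PGD_INDEX check are two views of one fact: the Switcher owns the top 4MB of virtual address space, which is exactly the last of the 1024 top-level entries. The arithmetic, as a checkable sketch (i386, 4k pages, no PAE assumed):

    #include <assert.h>

    int main(void)
    {
            unsigned long ptrs_per_pgd = 1024;
            unsigned long switcher_pgd_index = ptrs_per_pgd - 1; /* 1023 */
            unsigned long switcher_base = 0xFFC00000UL;          /* top 4MB */

            /* pgd_index() is just vaddr >> 22 in this configuration. */
            assert(switcher_base >> 22 == switcher_pgd_index);
            /* So a kernel_address whose pgd index reached 1023 would let
             * flush_user_mappings() free the Switcher mapping: hence the
             * kill_guest() above. */
            return 0;
    }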
/* When a Guest dies, our cleanup is fairly simple. */
void free_guest_pagetable(struct lguest *lg)
{
@@ -594,14 +617,14 @@ void free_guest_pagetable(struct lguest *lg)
* for each CPU already set up, we just need to hook them in. */
void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
{
- spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
- spgd_t switcher_pgd;
- spte_t regs_pte;
+ pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
+ pgd_t switcher_pgd;
+ pte_t regs_pte;
/* Make the last PGD entry for this Guest point to the Switcher's PTE
* page for this CPU (with appropriate flags). */
- switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT;
- switcher_pgd.flags = _PAGE_KERNEL;
+ switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL);
+
lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
/* We also change the Switcher PTE page. When we're running the Guest,
@@ -611,10 +634,8 @@ void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
* CPU's "struct lguest_pages": if we make sure the Guest's register
* page is already mapped there, we don't have to copy them out
* again. */
- regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT;
- regs_pte.flags = _PAGE_KERNEL;
- switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE]
- = regs_pte;
+ regs_pte = pfn_pte(__pa(lg->regs_page) >> PAGE_SHIFT, __pgprot(_PAGE_KERNEL));
+ switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte;
}
/*:*/
@@ -635,24 +656,25 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
unsigned int pages)
{
unsigned int i;
- spte_t *pte = switcher_pte_page(cpu);
+ pte_t *pte = switcher_pte_page(cpu);
/* The first entries are easy: they map the Switcher code. */
for (i = 0; i < pages; i++) {
- pte[i].pfn = page_to_pfn(switcher_page[i]);
- pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED;
+ pte[i] = mk_pte(switcher_page[i],
+ __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
}
/* The only other thing we map is this CPU's pair of pages. */
i = pages + cpu*2;
/* First page (Guest registers) is writable from the Guest */
- pte[i].pfn = page_to_pfn(switcher_page[i]);
- pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW;
+ pte[i] = pfn_pte(page_to_pfn(switcher_page[i]),
+ __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW));
+
/* The second page contains the "struct lguest_ro_state", and is
* read-only. */
- pte[i+1].pfn = page_to_pfn(switcher_page[i+1]);
- pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED;
+ pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]),
+ __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
}
/*H:510 At boot or module load time, init_pagetables() allocates and populates
@@ -662,7 +684,7 @@ __init int init_pagetables(struct page **switcher_page, unsigned int pages)
unsigned int i;
for_each_possible_cpu(i) {
- switcher_pte_page(i) = (spte_t *)get_zeroed_page(GFP_KERNEL);
+ switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
if (!switcher_pte_page(i)) {
free_switcher_pte_pages();
return -ENOMEM;
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
index 9b81119f46e..c2434ec99f7 100644
--- a/drivers/lguest/segments.c
+++ b/drivers/lguest/segments.c
@@ -73,14 +73,14 @@ static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
/* Segment descriptors contain a privilege level: the Guest is
* sometimes careless and leaves this as 0, even though it's
* running at privilege level 1. If so, we fix it here. */
- if ((lg->gdt[i].b & 0x00006000) == 0)
- lg->gdt[i].b |= (GUEST_PL << 13);
+ if ((lg->arch.gdt[i].b & 0x00006000) == 0)
+ lg->arch.gdt[i].b |= (GUEST_PL << 13);
/* Each descriptor has an "accessed" bit. If we don't set it
* now, the CPU will try to set it when the Guest first loads
* that entry into a segment register. But the GDT isn't
* writable by the Guest, so bad things can happen. */
- lg->gdt[i].b |= 0x00000100;
+ lg->arch.gdt[i].b |= 0x00000100;
}
}
@@ -106,12 +106,12 @@ void setup_default_gdt_entries(struct lguest_ro_state *state)
void setup_guest_gdt(struct lguest *lg)
{
/* Start with full 0-4G segments... */
- lg->gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
- lg->gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
+ lg->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
+ lg->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
/* ...except the Guest is allowed to use them, so set the privilege
* level appropriately in the flags. */
- lg->gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
- lg->gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
+ lg->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
+ lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
}
/* Like the IDT, we never simply use the GDT the Guest gives us. We set up the
@@ -126,7 +126,7 @@ void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)
unsigned int i;
for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++)
- gdt[i] = lg->gdt[i];
+ gdt[i] = lg->arch.gdt[i];
}
/* This is the full version */
@@ -138,7 +138,7 @@ void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
* replaced. See ignored_gdt() above. */
for (i = 0; i < GDT_ENTRIES; i++)
if (!ignored_gdt(i))
- gdt[i] = lg->gdt[i];
+ gdt[i] = lg->arch.gdt[i];
}
/* This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). */
@@ -146,12 +146,12 @@ void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)
{
/* We assume the Guest has the same number of GDT entries as the
* Host, otherwise we'd have to dynamically allocate the Guest GDT. */
- if (num > ARRAY_SIZE(lg->gdt))
+ if (num > ARRAY_SIZE(lg->arch.gdt))
kill_guest(lg, "too many gdt entries %i", num);
/* We read the whole thing in, then fix it up. */
- lgread(lg, lg->gdt, table, num * sizeof(lg->gdt[0]));
- fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->gdt));
+ __lgread(lg, lg->arch.gdt, table, num * sizeof(lg->arch.gdt[0]));
+ fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->arch.gdt));
/* Mark that the GDT changed so the core knows it has to copy it again,
* even if the Guest is run on the same CPU. */
lg->changed |= CHANGED_GDT;
@@ -159,9 +159,9 @@ void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)
void guest_load_tls(struct lguest *lg, unsigned long gtls)
{
- struct desc_struct *tls = &lg->gdt[GDT_ENTRY_TLS_MIN];
+ struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN];
- lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
+ __lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
lg->changed |= CHANGED_GDT_TLS;
}
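
fixup_gdt_table() pokes raw descriptor bits, which are worth decoding once: in the high word ("b") of an i386 segment descriptor, bits 13-14 hold the descriptor privilege level and bit 8 is the accessed flag, exactly the 0x00006000 and 0x00000100 masks above. A standalone sketch of the same bit-twiddling (0x00CF9A00 is a flat 4GB DPL-0 code segment; GUEST_PL is 1):

    #include <assert.h>

    int main(void)
    {
            unsigned int guest_pl = 1;      /* lguest's GUEST_PL */
            unsigned int b = 0x00CF9A00;    /* flags word: DPL 0, code */

            if ((b & 0x00006000) == 0)      /* DPL sits in bits 13-14 */
                    b |= guest_pl << 13;    /* now DPL 1 */
            b |= 0x00000100;                /* pre-set the accessed bit */

            assert((b & 0x00006000) == 0x2000);
            assert(b & 0x00000100);
            return 0;
    }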
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
new file mode 100644
index 00000000000..9eed12d5a39
--- /dev/null
+++ b/drivers/lguest/x86/core.c
@@ -0,0 +1,577 @@
+/*
+ * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
+ * Copyright (C) 2007, Jes Sorensen <jes@sgi.com> SGI.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/kernel.h>
+#include <linux/start_kernel.h>
+#include <linux/string.h>
+#include <linux/console.h>
+#include <linux/screen_info.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/cpu.h>
+#include <linux/lguest.h>
+#include <linux/lguest_launcher.h>
+#include <asm/paravirt.h>
+#include <asm/param.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+#include <asm/setup.h>
+#include <asm/lguest.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
+#include "../lg.h"
+
+static int cpu_had_pge;
+
+static struct {
+ unsigned long offset;
+ unsigned short segment;
+} lguest_entry;
+
+/* Offset from where switcher.S was compiled to where we've copied it */
+static unsigned long switcher_offset(void)
+{
+ return SWITCHER_ADDR - (unsigned long)start_switcher_text;
+}
+
+/* This cpu's struct lguest_pages. */
+static struct lguest_pages *lguest_pages(unsigned int cpu)
+{
+ return &(((struct lguest_pages *)
+ (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
+}
+
+static DEFINE_PER_CPU(struct lguest *, last_guest);
+
+/*S:010
+ * We are getting close to the Switcher.
+ *
+ * Remember that each CPU has two pages which are visible to the Guest when it
+ * runs on that CPU.  These pages hold the state for that Guest: we copy the
+ * state in just before we run the Guest.
+ *
+ * Each Guest has "changed" flags which indicate what has changed in the Guest
+ * since it last ran. We saw this set in interrupts_and_traps.c and
+ * segments.c.
+ */
+static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
+{
+ /* Copying all this data can be quite expensive. We usually run the
+ * same Guest we ran last time (and that Guest hasn't run anywhere else
+ * meanwhile). If that's not the case, we pretend everything in the
+ * Guest has changed. */
+ if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) {
+ __get_cpu_var(last_guest) = lg;
+ lg->last_pages = pages;
+ lg->changed = CHANGED_ALL;
+ }
+
+ /* These copies are pretty cheap, so we do them unconditionally: */
+ /* Save the current Host top-level page directory. */
+ pages->state.host_cr3 = __pa(current->mm->pgd);
+ /* Set up the Guest's page tables to see this CPU's pages (and no
+ * other CPU's pages). */
+ map_switcher_in_guest(lg, pages);
+ /* Set up the two "TSS" members which tell the CPU what stack to use
+ * for traps which do directly into the Guest (ie. traps at privilege
+ * level 1). */
+ pages->state.guest_tss.esp1 = lg->esp1;
+ pages->state.guest_tss.ss1 = lg->ss1;
+
+ /* Copy direct-to-Guest trap entries. */
+ if (lg->changed & CHANGED_IDT)
+ copy_traps(lg, pages->state.guest_idt, default_idt_entries);
+
+ /* Copy all GDT entries which the Guest can change. */
+ if (lg->changed & CHANGED_GDT)
+ copy_gdt(lg, pages->state.guest_gdt);
+ /* If only the TLS entries have changed, copy them. */
+ else if (lg->changed & CHANGED_GDT_TLS)
+ copy_gdt_tls(lg, pages->state.guest_gdt);
+
+ /* Mark the Guest as unchanged for next time. */
+ lg->changed = 0;
+}
+
+/* Finally: the code to actually call into the Switcher to run the Guest. */
+static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
+{
+ /* This is a dummy value we need for GCC's sake. */
+ unsigned int clobber;
+
+ /* Copy the guest-specific information into this CPU's "struct
+ * lguest_pages". */
+ copy_in_guest_info(lg, pages);
+
+ /* Set the trap number to 256 (impossible value). If we fault while
+ * switching to the Guest (bad segment registers or bug), this will
+ * cause us to abort the Guest. */
+ lg->regs->trapnum = 256;
+
+ /* Now: we push the "eflags" register on the stack, then do an "lcall".
+ * This is how we change from using the kernel code segment to using
+ * the dedicated lguest code segment, as well as jumping into the
+ * Switcher.
+ *
+ * The lcall also pushes the old code segment (KERNEL_CS) onto the
+ * stack, then the address of this call. This stack layout happens to
+ * exactly match the stack of an interrupt... */
+ asm volatile("pushf; lcall *lguest_entry"
+ /* This is how we tell GCC that %eax ("a") and %ebx ("b")
+ * are changed by this routine. The "=" means output. */
+ : "=a"(clobber), "=b"(clobber)
+ /* %eax contains the pages pointer. ("0" refers to the
+ * 0-th argument above, ie "a"). %ebx contains the
+ * physical address of the Guest's top-level page
+ * directory. */
+ : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir))
+ /* We tell gcc that all these registers could change,
+ * which means we don't have to save and restore them in
+ * the Switcher. */
+ : "memory", "%edx", "%ecx", "%edi", "%esi");
+}
+/*:*/
+
+/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts
+ * are disabled: we own the CPU. */
+void lguest_arch_run_guest(struct lguest *lg)
+{
+ /* Remember the awfully-named TS bit? If the Guest has asked
+ * to set it we set it now, so we can trap and pass that trap
+ * to the Guest if it uses the FPU. */
+ if (lg->ts)
+ lguest_set_ts();
+
+ /* SYSENTER is an optimized way of doing system calls. We
+ * can't allow it because it always jumps to privilege level 0.
+ * A normal Guest won't try it because we don't advertise it in
+ * CPUID, but a malicious Guest (or malicious Guest userspace
+ * program) could, so we tell the CPU to disable it before
+ * running the Guest. */
+ if (boot_cpu_has(X86_FEATURE_SEP))
+ wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
+
+ /* Now we actually run the Guest. It will pop back out when
+ * something interesting happens, and we can examine its
+ * registers to see what it was doing. */
+ run_guest_once(lg, lguest_pages(raw_smp_processor_id()));
+
+ /* The "regs" pointer contains two extra entries which are not
+ * really registers: a trap number which says what interrupt or
+ * trap made the switcher code come back, and an error code
+ * which some traps set. */
+
+ /* If the Guest page faulted, then the cr2 register will tell
+ * us the bad virtual address. We have to grab this now,
+ * because once we re-enable interrupts an interrupt could
+ * fault and thus overwrite cr2, or we could even move off to a
+ * different CPU. */
+ if (lg->regs->trapnum == 14)
+ lg->arch.last_pagefault = read_cr2();
+ /* Similarly, if we took a trap because the Guest used the FPU,
+ * we have to restore the FPU it expects to see. */
+ else if (lg->regs->trapnum == 7)
+ math_state_restore();
+
+ /* Restore SYSENTER if it's supposed to be on. */
+ if (boot_cpu_has(X86_FEATURE_SEP))
+ wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
+}
+
+/*H:130 Our Guest is usually so well behaved; it never tries to do things it
+ * isn't allowed to. Unfortunately, Linux's paravirtual infrastructure isn't
+ * quite complete, because it doesn't contain replacements for the Intel I/O
+ * instructions. As a result, the Guest sometimes fumbles across one during
+ * the boot process as it probes for various things which are usually attached
+ * to a PC.
+ *
+ * When the Guest uses one of these instructions, we get trap #13 (General
+ * Protection Fault) and come here. We see if it's one of those troublesome
+ * instructions and skip over it. We return true if we did. */
+static int emulate_insn(struct lguest *lg)
+{
+ u8 insn;
+ unsigned int insnlen = 0, in = 0, shift = 0;
+ /* The eip contains the *virtual* address of the Guest's instruction:
+ * guest_pa() walks the Guest's page tables to find the physical address. */
+ unsigned long physaddr = guest_pa(lg, lg->regs->eip);
+
+ /* This must be the Guest kernel trying to do something, not userspace!
+ * The bottom two bits of the CS segment register are the privilege
+ * level. */
+ if ((lg->regs->cs & 3) != GUEST_PL)
+ return 0;
+
+ /* Decoding x86 instructions is icky. */
+ insn = lgread(lg, physaddr, u8);
+
+ /* 0x66 is an "operand prefix". It means it's using the upper 16 bits
+ of the eax register. */
+ if (insn == 0x66) {
+ shift = 16;
+ /* The instruction is 1 byte so far, read the next byte. */
+ insnlen = 1;
+ insn = lgread(lg, physaddr + insnlen, u8);
+ }
+
+ /* We can ignore the lower bit for the moment and decode the 4 opcodes
+ * we need to emulate. */
+ switch (insn & 0xFE) {
+ case 0xE4: /* in <next byte>,%al */
+ insnlen += 2;
+ in = 1;
+ break;
+ case 0xEC: /* in (%dx),%al */
+ insnlen += 1;
+ in = 1;
+ break;
+ case 0xE6: /* out %al,<next byte> */
+ insnlen += 2;
+ break;
+ case 0xEE: /* out %al,(%dx) */
+ insnlen += 1;
+ break;
+ default:
+ /* OK, we don't know what this is, can't emulate. */
+ return 0;
+ }
+
+ /* If it was an "IN" instruction, they expect the result to be read
+ * into %eax, so we change %eax. We always return all-ones, which
+ * traditionally means "there's nothing there". */
+ if (in) {
+ /* Lower bit tells us whether it's a 16 or 32 bit access */
+ if (insn & 0x1)
+ lg->regs->eax = 0xFFFFFFFF;
+ else
+ lg->regs->eax |= (0xFFFF << shift);
+ }
+ /* Finally, we've "done" the instruction, so move past it. */
+ lg->regs->eip += insnlen;
+ /* Success! */
+ return 1;
+}
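
The opcode mask in emulate_insn() leans on a regularity of the x86 encoding: each of the four I/O opcodes differs from its 16/32-bit twin only in the bottom bit. The same trick as a standalone decoder:

    #include <stdio.h>

    /* Returns 1 for the IN forms, 0 for OUT, -1 for anything else. */
    static int decode_io(unsigned char insn)
    {
            switch (insn & 0xFE) {  /* low bit: 8-bit vs 16/32-bit operand */
            case 0xE4:              /* in  imm8,%al   / in  imm8,%(e)ax   */
            case 0xEC:              /* in  (%dx),%al  / in  (%dx),%(e)ax  */
                    return 1;
            case 0xE6:              /* out %al,imm8   / out %(e)ax,imm8   */
            case 0xEE:              /* out %al,(%dx)  / out %(e)ax,(%dx)  */
                    return 0;
            default:
                    return -1;
            }
    }

    int main(void)
    {
            /* Prints: 1 0 -1 */
            printf("%d %d %d\n", decode_io(0xE5), decode_io(0xEF),
                   decode_io(0x90));
            return 0;
    }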
+
+/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
+void lguest_arch_handle_trap(struct lguest *lg)
+{
+ switch (lg->regs->trapnum) {
+ case 13: /* We've intercepted a GPF. */
+ /* Check if this was one of those annoying IN or OUT
+ * instructions which we need to emulate. If so, we
+ * just go back into the Guest after we've done it. */
+ if (lg->regs->errcode == 0) {
+ if (emulate_insn(lg))
+ return;
+ }
+ break;
+ case 14: /* We've intercepted a page fault. */
+ /* The Guest accessed a virtual address that wasn't
+ * mapped. This happens a lot: we don't actually set
+ * up most of the page tables for the Guest at all when
+ * we start: as it runs it asks for more and more, and
+ * we set them up as required. In this case, we don't
+ * even tell the Guest that the fault happened.
+ *
+ * The errcode tells whether this was a read or a
+ * write, and whether kernel or userspace code. */
+ if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode))
+ return;
+
+ /* OK, it's really not there (or not OK): the Guest
+ * needs to know. We write out the cr2 value so it
+ * knows where the fault occurred.
+ *
+ * Note that if the Guest were really messed up, this
+ * could happen before it's done the INITIALIZE
+ * hypercall, so lg->lguest_data will be NULL */
+ if (lg->lguest_data &&
+ put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2))
+ kill_guest(lg, "Writing cr2");
+ break;
+ case 7: /* We've intercepted a Device Not Available fault. */
+ /* If the Guest doesn't want to know, we already
+ * restored the Floating Point Unit, so we just
+ * continue without telling it. */
+ if (!lg->ts)
+ return;
+ break;
+ case 32 ... 255:
+ /* These values mean a real interrupt occurred, in which case
+ * the Host handler has already been run. We just do a
+ * friendly check if another process should now be run, then
+ * return to run the Guest again */
+ cond_resched();
+ return;
+ case LGUEST_TRAP_ENTRY:
+ /* Our 'struct hcall_args' maps directly over our regs: we set
+ * up the pointer now to indicate a hypercall is pending. */
+ lg->hcall = (struct hcall_args *)lg->regs;
+ return;
+ }
+
+ /* We didn't handle the trap, so it needs to go to the Guest. */
+ if (!deliver_trap(lg, lg->regs->trapnum))
+ /* If the Guest doesn't have a handler (either it hasn't
+ * registered any yet, or it's one of the faults we don't let
+ * it handle), it dies with a cryptic error message. */
+ kill_guest(lg, "unhandled trap %li at %#lx (%#lx)",
+ lg->regs->trapnum, lg->regs->eip,
+ lg->regs->trapnum == 14 ? lg->arch.last_pagefault
+ : lg->regs->errcode);
+}
+
+/* Now we can look at each of the routines this calls, in increasing order of
+ * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
+ * deliver_trap() and demand_page(). After all those, we'll be ready to
+ * examine the Switcher, and our philosophical understanding of the Host/Guest
+ * duality will be complete. :*/
+static void adjust_pge(void *on)
+{
+ if (on)
+ write_cr4(read_cr4() | X86_CR4_PGE);
+ else
+ write_cr4(read_cr4() & ~X86_CR4_PGE);
+}
+
+/*H:020 Now the Switcher is mapped and everything else is ready, we need to do
+ * some more i386-specific initialization. */
+void __init lguest_arch_host_init(void)
+{
+ int i;
+
+ /* Most of the i386/switcher.S doesn't care that it's been moved; on
+ * Intel, jumps are relative, and it doesn't access any references to
+ * external code or data.
+ *
+ * The only exception is the interrupt handlers in switcher.S: their
+ * addresses are placed in a table (default_idt_entries), so we need to
+ * update the table with the new addresses. switcher_offset() is a
+ * convenience function which returns the distance between the builtin
+ * switcher code and the high-mapped copy we just made. */
+ for (i = 0; i < IDT_ENTRIES; i++)
+ default_idt_entries[i] += switcher_offset();
+
+ /*
+ * Set up the Switcher's per-cpu areas.
+ *
+ * Each CPU gets two pages of its own within the high-mapped region
+ * (aka. "struct lguest_pages"). Much of this can be initialized now,
+ * but some depends on what Guest we are running (which is set up in
+ * copy_in_guest_info()).
+ */
+ for_each_possible_cpu(i) {
+ /* lguest_pages() returns this CPU's two pages. */
+ struct lguest_pages *pages = lguest_pages(i);
+ /* This is a convenience pointer to make the code fit one
+ * statement to a line. */
+ struct lguest_ro_state *state = &pages->state;
+
+ /* The Global Descriptor Table: the Host has a different one
+ * for each CPU. We keep a descriptor for the GDT which says
+ * where it is and how big it is (the size field actually holds
+ * the offset of the last byte, hence the "-1"). */
+ state->host_gdt_desc.size = GDT_SIZE-1;
+ state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
+
+ /* All CPUs on the Host use the same Interrupt Descriptor
+ * Table, so we just use store_idt(), which gets this CPU's IDT
+ * descriptor. */
+ store_idt(&state->host_idt_desc);
+
+ /* The descriptors for the Guest's GDT and IDT can be filled
+ * out now, too. We copy the GDT & IDT into ->guest_gdt and
+ * ->guest_idt before actually running the Guest. */
+ state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
+ state->guest_idt_desc.address = (long)&state->guest_idt;
+ state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
+ state->guest_gdt_desc.address = (long)&state->guest_gdt;
+
+ /* We know where we want the stack to be when the Guest enters
+ * the switcher: in pages->regs. The stack grows downwards, so
+ * we start it at the end of that structure. */
+ state->guest_tss.esp0 = (long)(&pages->regs + 1);
+ /* And this is the GDT entry to use for the stack: we keep a
+ * couple of special LGUEST entries. */
+ state->guest_tss.ss0 = LGUEST_DS;
+
+ /* x86 can have a fine-grained bitmap which indicates what I/O
+ * ports the process can use. We set it to the end of our
+ * structure, meaning "none". */
+ state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
+
+ /* Some GDT entries are the same across all Guests, so we can
+ * set them up now. */
+ setup_default_gdt_entries(state);
+ /* Most IDT entries are the same for all Guests, too.*/
+ setup_default_idt_entries(state, default_idt_entries);
+
+ /* The Host needs to be able to use the LGUEST segments on this
+ * CPU, too, so put them in the Host GDT. */
+ get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
+ get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
+ }
+
+ /* In the Switcher, we want the %cs segment register to use the
+ * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
+ * it will be undisturbed when we switch. To change %cs and jump we
+ * need this structure to feed to Intel's "lcall" instruction. */
+ lguest_entry.offset = (long)switch_to_guest + switcher_offset();
+ lguest_entry.segment = LGUEST_CS;
+
+ /* Finally, we need to turn off "Page Global Enable". PGE is an
+ * optimization where page table entries are specially marked to show
+ * they never change. The Host kernel marks all the kernel pages this
+ * way because it's always present, even when userspace is running.
+ *
+ * Lguest breaks this: unbeknownst to the rest of the Host kernel, we
+ * switch to the Guest kernel. If you don't disable this on all CPUs,
+ * you'll get really weird bugs that you'll chase for two days.
+ *
+ * I used to turn PGE off every time we switched to the Guest and back
+ * on when we return, but that slowed the Switcher down noticeably. */
+
+ /* We don't need the complexity of CPUs coming and going while we're
+ * doing this. */
+ lock_cpu_hotplug();
+ if (cpu_has_pge) { /* We have a broader idea of "global". */
+ /* Remember that this was originally set (for cleanup). */
+ cpu_had_pge = 1;
+ /* adjust_pge is a helper function which sets or unsets the PGE
+ * bit on its CPU, depending on the argument (0 == unset). */
+ on_each_cpu(adjust_pge, (void *)0, 0, 1);
+ /* Turn off the feature in the global feature set. */
+ clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+ }
+ unlock_cpu_hotplug();
+}
+/*:*/
+
+void __exit lguest_arch_host_fini(void)
+{
+ /* If we had PGE before we started, turn it back on now. */
+ lock_cpu_hotplug();
+ if (cpu_had_pge) {
+ set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+ /* adjust_pge's argument "1" means set PGE. */
+ on_each_cpu(adjust_pge, (void *)1, 0, 1);
+ }
+ unlock_cpu_hotplug();
+}
+
+
+/*H:122 The i386-specific hypercalls simply farm out to the right functions. */
+int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args)
+{
+ switch (args->arg0) {
+ case LHCALL_LOAD_GDT:
+ load_guest_gdt(lg, args->arg1, args->arg2);
+ break;
+ case LHCALL_LOAD_IDT_ENTRY:
+ load_guest_idt_entry(lg, args->arg1, args->arg2, args->arg3);
+ break;
+ case LHCALL_LOAD_TLS:
+ guest_load_tls(lg, args->arg1);
+ break;
+ default:
+ /* Bad Guest. Bad! */
+ return -EIO;
+ }
+ return 0;
+}
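
For completeness, the other half of this dispatch lives in the Guest: a hypercall is just a software interrupt on vector LGUEST_TRAP_ENTRY (0x1F) with the call number and arguments in registers, which is why lguest_arch_handle_trap() can overlay a struct hcall_args on the saved register frame. A hedged sketch of the Guest-side stub; the register assignments are my reading of the i386 lguest ABI of this era, not something this patch itself spells out:

    /* Illustrative Guest-side hypercall stub (i386, gcc inline asm). */
    static unsigned long hcall(unsigned long call, unsigned long arg1,
                               unsigned long arg2, unsigned long arg3)
    {
            /* Trap to the Host; it returns the result in %eax. */
            asm volatile("int $0x1f"
                         : "=a"(call)
                         : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
                         : "memory");
            return call;
    }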
+
+/*H:126 i386-specific hypercall initialization: */
+int lguest_arch_init_hypercalls(struct lguest *lg)
+{
+ u32 tsc_speed;
+
+ /* The pointer to the Guest's "struct lguest_data" is the only
+ * argument. We check that address now. */
+ if (!lguest_address_ok(lg, lg->hcall->arg1, sizeof(*lg->lguest_data)))
+ return -EFAULT;
+
+ /* Having checked it, we simply set lg->lguest_data to point straight
+ * into the Launcher's memory at the right place and then use
+ * copy_to_user/from_user from now on, instead of lgread/write. I put
+ * this in to show that I'm not immune to writing stupid
+ * optimizations. */
+ lg->lguest_data = lg->mem_base + lg->hcall->arg1;
+
+ /* We insist that the Time Stamp Counter exist and doesn't change with
+ * cpu frequency. Some devious chip manufacturers decided that TSC
+ * changes could be handled in software. I decided that time going
+ * backwards might be good for benchmarks, but it's bad for users.
+ *
+ * We also insist that the TSC be stable: the kernel detects unreliable
+ * TSCs for its own purposes, and we use that here. */
+ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
+ tsc_speed = tsc_khz;
+ else
+ tsc_speed = 0;
+ if (put_user(tsc_speed, &lg->lguest_data->tsc_khz))
+ return -EFAULT;
+
+ /* The interrupt code might not like the system call vector. */
+ if (!check_syscall_vector(lg))
+ kill_guest(lg, "bad syscall vector");
+
+ return 0;
+}
+/* Now we've examined the hypercall code; our Guest can make requests. There
+ * is one other way we can do things for the Guest, as we see in
+ * emulate_insn(). :*/
+
+/*L:030 lguest_arch_setup_regs()
+ *
+ * Most of the Guest's registers are left alone: we used get_zeroed_page() to
+ * allocate the structure, so they will be 0. */
+void lguest_arch_setup_regs(struct lguest *lg, unsigned long start)
+{
+ struct lguest_regs *regs = lg->regs;
+
+ /* There are four "segment" registers which the Guest needs to boot:
+ * The "code segment" register (cs) refers to the kernel code segment
+ * __KERNEL_CS, and the "data", "extra" and "stack" segment registers
+ * refer to the kernel data segment __KERNEL_DS.
+ *
+ * The privilege level is packed into the lower bits. The Guest runs
+ * at privilege level 1 (GUEST_PL).*/
+ regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL;
+ regs->cs = __KERNEL_CS|GUEST_PL;
+
+ /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002)
+ * is supposed to always be "1". Bit 9 (0x200) controls whether
+ * interrupts are enabled. We always leave interrupts enabled while
+ * running the Guest. */
+ regs->eflags = 0x202;
+
+ /* The "Extended Instruction Pointer" register says where the Guest is
+ * running. */
+ regs->eip = start;
+
+ /* %esi points to our boot information, at physical address 0, so don't
+ * touch it. */
+ /* There are a couple of GDT entries the Guest expects when first
+ * booting. */
+
+ setup_guest_gdt(lg);
+}
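
That eflags = 0x202 packs two facts: bit 1 is the always-one reserved bit and bit 9 (IF) enables interrupts, just as the comment says. A one-line sanity check, as a sketch:

    #include <assert.h>

    int main(void)
    {
            unsigned long eflags = 0x202;

            assert(eflags & (1UL << 1));    /* reserved bit, always 1 */
            assert(eflags & (1UL << 9));    /* IF: interrupts enabled */
            return 0;
    }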
diff --git a/drivers/lguest/switcher.S b/drivers/lguest/x86/switcher_32.S
index 7c9c230cc84..1010b90b11f 100644
--- a/drivers/lguest/switcher.S
+++ b/drivers/lguest/x86/switcher_32.S
@@ -48,7 +48,8 @@
#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#include <asm/page.h>
-#include "lg.h"
+#include <asm/segment.h>
+#include <asm/lguest.h>
// We mark the start of the code to copy
// It's placed in .text tho it's never run here
@@ -132,6 +133,7 @@ ENTRY(switch_to_guest)
// The Guest's register page has been mapped
// Writable onto our %esp (stack) --
// We can simply pop off all Guest regs.
+ popl %eax
popl %ebx
popl %ecx
popl %edx
@@ -139,7 +141,6 @@ ENTRY(switch_to_guest)
popl %edi
popl %ebp
popl %gs
- popl %eax
popl %fs
popl %ds
popl %es
@@ -167,7 +168,6 @@ ENTRY(switch_to_guest)
pushl %es; \
pushl %ds; \
pushl %fs; \
- pushl %eax; \
pushl %gs; \
pushl %ebp; \
pushl %edi; \
@@ -175,6 +175,7 @@ ENTRY(switch_to_guest)
pushl %edx; \
pushl %ecx; \
pushl %ebx; \
+ pushl %eax; \
/* Our stack and our code are using segments \
* Set in the TSS and IDT \
* Yet if we were to touch data we'd use \
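The push/pop reshuffle only makes sense against the register frame it walks: the Switcher pops straight off a page mapped over struct lguest_regs, so the pop order must be the struct's field order, lowest address first. Moving %eax to the front of the pops (and the end of the pushes) matches a layout roughly like this — a sketch of the 2.6.24-era x86 struct, for orientation only:

	struct lguest_regs {
		/* Manually saved part, in pop order: */
		unsigned long eax, ebx, ecx, edx;
		unsigned long esi, edi, ebp;
		unsigned long gs;
		unsigned long fs, ds, es;
		unsigned long trapnum, errcode;
		/* Pushed by the CPU on trap: */
		unsigned long eip, cs, eflags, esp, ss;
	};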
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 927cb34c480..7c426d07a55 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -274,7 +274,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
if (bitmap->offset < 0) {
/* DATA BITMAP METADATA */
if (bitmap->offset
- + page->index * (PAGE_SIZE/512)
+ + (long)(page->index * (PAGE_SIZE/512))
+ size/512 > 0)
/* bitmap runs in to metadata */
return -EINVAL;
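The cast matters because page->index is unsigned (pgoff_t): adding the possibly-negative bitmap->offset to an unsigned product converts the whole expression to unsigned, so a bitmap stored before the data would always appear to run into the metadata. A standalone illustration of the promotion, with hypothetical values:

	#include <stdio.h>

	int main(void)
	{
		long offset = -8;		/* bitmap sits before the data   */
		unsigned long sectors = 0;	/* page->index * (PAGE_SIZE/512) */
		long size = 4;			/* size/512                      */

		/* -8 converts to a huge unsigned value: check misfires. */
		printf("%d\n", offset + sectors + size > 0);		/* 1 */
		/* The cast keeps the arithmetic signed: -8+0+4 = -4.   */
		printf("%d\n", offset + (long)sectors + size > 0);	/* 0 */
		return 0;
	}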
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 0eb5416798b..ac54f697c50 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -348,16 +348,17 @@ static int crypt_convert(struct crypt_config *cc,
ctx->idx_out < ctx->bio_out->bi_vcnt) {
struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in);
struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out);
- struct scatterlist sg_in = {
- .page = bv_in->bv_page,
- .offset = bv_in->bv_offset + ctx->offset_in,
- .length = 1 << SECTOR_SHIFT
- };
- struct scatterlist sg_out = {
- .page = bv_out->bv_page,
- .offset = bv_out->bv_offset + ctx->offset_out,
- .length = 1 << SECTOR_SHIFT
- };
+ struct scatterlist sg_in, sg_out;
+
+ sg_init_table(&sg_in, 1);
+ sg_set_page(&sg_in, bv_in->bv_page);
+ sg_in.offset = bv_in->bv_offset + ctx->offset_in;
+ sg_in.length = 1 << SECTOR_SHIFT;
+
+ sg_init_table(&sg_out, 1);
+ sg_set_page(&sg_out, bv_out->bv_page);
+ sg_out.offset = bv_out->bv_offset + ctx->offset_out;
+ sg_out.length = 1 << SECTOR_SHIFT;
ctx->offset_in += sg_in.length;
if (ctx->offset_in >= bv_in->bv_len) {
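This is the same conversion applied throughout this series: with scatterlist chaining, the low bits of the entry double as flag bits, so entries must be built through the accessors rather than by aggregate initialization. The pattern, in the 2.6.24-era API where sg_set_page() takes only the page (later kernels fold offset and length in); some_page is a stand-in:

	struct scatterlist sg;

	sg_init_table(&sg, 1);		/* zero the entry, mark end of table */
	sg_set_page(&sg, some_page);	/* set the page without clobbering   */
					/* the chain/termination marker bits */
	sg.offset = 0;
	sg.length = PAGE_SIZE;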
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8ee181a01f5..80a67d789b7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -376,7 +376,12 @@ static unsigned long get_stripe_work(struct stripe_head *sh)
ack++;
sh->ops.count -= ack;
- BUG_ON(sh->ops.count < 0);
+ if (unlikely(sh->ops.count < 0)) {
+ printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx "
+ "ops.complete: %#lx\n", pending, sh->ops.pending,
+ sh->ops.ack, sh->ops.complete);
+ BUG();
+ }
return pending;
}
@@ -550,8 +555,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
}
}
}
- clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
- clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
+ set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
return_io(return_bi);
@@ -2893,6 +2897,13 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
/* Now to look around and see what can be done */
+ /* clean-up completed biofill operations */
+ if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) {
+ clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
+ clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
+ clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
+ }
+
rcu_read_lock();
for (i=disks; i--; ) {
mdk_rdev_t *rdev;
diff --git a/drivers/media/common/ir-keymaps.c b/drivers/media/common/ir-keymaps.c
index aefcf28da1c..185e8a860c1 100644
--- a/drivers/media/common/ir-keymaps.c
+++ b/drivers/media/common/ir-keymaps.c
@@ -1074,41 +1074,41 @@ EXPORT_SYMBOL_GPL(ir_codes_manli);
/* Mike Baikov <mike@baikov.com> */
IR_KEYTAB_TYPE ir_codes_gotview7135[IR_KEYTAB_SIZE] = {
- [ 0x21 ] = KEY_POWER,
- [ 0x69 ] = KEY_TV,
- [ 0x33 ] = KEY_0,
- [ 0x51 ] = KEY_1,
- [ 0x31 ] = KEY_2,
- [ 0x71 ] = KEY_3,
- [ 0x3b ] = KEY_4,
- [ 0x58 ] = KEY_5,
- [ 0x41 ] = KEY_6,
- [ 0x48 ] = KEY_7,
- [ 0x30 ] = KEY_8,
- [ 0x53 ] = KEY_9,
- [ 0x73 ] = KEY_AGAIN, /* LOOP */
- [ 0x0a ] = KEY_AUDIO,
- [ 0x61 ] = KEY_PRINT, /* PREVIEW */
- [ 0x7a ] = KEY_VIDEO,
- [ 0x20 ] = KEY_CHANNELUP,
- [ 0x40 ] = KEY_CHANNELDOWN,
- [ 0x18 ] = KEY_VOLUMEDOWN,
- [ 0x50 ] = KEY_VOLUMEUP,
- [ 0x10 ] = KEY_MUTE,
- [ 0x4a ] = KEY_SEARCH,
- [ 0x7b ] = KEY_SHUFFLE, /* SNAPSHOT */
- [ 0x22 ] = KEY_RECORD,
- [ 0x62 ] = KEY_STOP,
- [ 0x78 ] = KEY_PLAY,
- [ 0x39 ] = KEY_REWIND,
- [ 0x59 ] = KEY_PAUSE,
- [ 0x19 ] = KEY_FORWARD,
- [ 0x09 ] = KEY_ZOOM,
-
- [ 0x52 ] = KEY_F21, /* LIVE TIMESHIFT */
- [ 0x1a ] = KEY_F22, /* MIN TIMESHIFT */
- [ 0x3a ] = KEY_F23, /* TIMESHIFT */
- [ 0x70 ] = KEY_F24, /* NORMAL TIMESHIFT */
+ [ 0x11 ] = KEY_POWER,
+ [ 0x35 ] = KEY_TV,
+ [ 0x1b ] = KEY_0,
+ [ 0x29 ] = KEY_1,
+ [ 0x19 ] = KEY_2,
+ [ 0x39 ] = KEY_3,
+ [ 0x1f ] = KEY_4,
+ [ 0x2c ] = KEY_5,
+ [ 0x21 ] = KEY_6,
+ [ 0x24 ] = KEY_7,
+ [ 0x18 ] = KEY_8,
+ [ 0x2b ] = KEY_9,
+ [ 0x3b ] = KEY_AGAIN, /* LOOP */
+ [ 0x06 ] = KEY_AUDIO,
+ [ 0x31 ] = KEY_PRINT, /* PREVIEW */
+ [ 0x3e ] = KEY_VIDEO,
+ [ 0x10 ] = KEY_CHANNELUP,
+ [ 0x20 ] = KEY_CHANNELDOWN,
+ [ 0x0c ] = KEY_VOLUMEDOWN,
+ [ 0x28 ] = KEY_VOLUMEUP,
+ [ 0x08 ] = KEY_MUTE,
+ [ 0x26 ] = KEY_SEARCH, /*SCAN*/
+ [ 0x3f ] = KEY_SHUFFLE, /* SNAPSHOT */
+ [ 0x12 ] = KEY_RECORD,
+ [ 0x32 ] = KEY_STOP,
+ [ 0x3c ] = KEY_PLAY,
+ [ 0x1d ] = KEY_REWIND,
+ [ 0x2d ] = KEY_PAUSE,
+ [ 0x0d ] = KEY_FORWARD,
+ [ 0x05 ] = KEY_ZOOM, /*FULL*/
+
+ [ 0x2a ] = KEY_F21, /* LIVE TIMESHIFT */
+ [ 0x0e ] = KEY_F22, /* MIN TIMESHIFT */
+ [ 0x1e ] = KEY_F23, /* TIMESHIFT */
+ [ 0x38 ] = KEY_F24, /* NORMAL TIMESHIFT */
};
EXPORT_SYMBOL_GPL(ir_codes_gotview7135);
diff --git a/drivers/media/common/saa7146_core.c b/drivers/media/common/saa7146_core.c
index 365a22118a0..2b1f8b4be00 100644
--- a/drivers/media/common/saa7146_core.c
+++ b/drivers/media/common/saa7146_core.c
@@ -112,12 +112,13 @@ static struct scatterlist* vmalloc_to_sg(unsigned char *virt, int nr_pages)
sglist = kcalloc(nr_pages, sizeof(struct scatterlist), GFP_KERNEL);
if (NULL == sglist)
return NULL;
+ sg_init_table(sglist, nr_pages);
for (i = 0; i < nr_pages; i++, virt += PAGE_SIZE) {
pg = vmalloc_to_page(virt);
if (NULL == pg)
goto err;
BUG_ON(PageHighMem(pg));
- sglist[i].page = pg;
+ sg_set_page(&sglist[i], pg);
sglist[i].length = PAGE_SIZE;
}
return sglist;
diff --git a/drivers/media/dvb/cinergyT2/cinergyT2.c b/drivers/media/dvb/cinergyT2/cinergyT2.c
index a05e5c18228..db08b0a8888 100644
--- a/drivers/media/dvb/cinergyT2/cinergyT2.c
+++ b/drivers/media/dvb/cinergyT2/cinergyT2.c
@@ -345,7 +345,9 @@ static int cinergyt2_start_feed(struct dvb_demux_feed *dvbdmxfeed)
struct dvb_demux *demux = dvbdmxfeed->demux;
struct cinergyt2 *cinergyt2 = demux->priv;
- if (cinergyt2->disconnect_pending || mutex_lock_interruptible(&cinergyt2->sem))
+ if (cinergyt2->disconnect_pending)
+ return -EAGAIN;
+ if (mutex_lock_interruptible(&cinergyt2->sem))
return -ERESTARTSYS;
if (cinergyt2->streaming == 0)
@@ -361,7 +363,9 @@ static int cinergyt2_stop_feed(struct dvb_demux_feed *dvbdmxfeed)
struct dvb_demux *demux = dvbdmxfeed->demux;
struct cinergyt2 *cinergyt2 = demux->priv;
- if (cinergyt2->disconnect_pending || mutex_lock_interruptible(&cinergyt2->sem))
+ if (cinergyt2->disconnect_pending)
+ return -EAGAIN;
+ if (mutex_lock_interruptible(&cinergyt2->sem))
return -ERESTARTSYS;
if (--cinergyt2->streaming == 0)
@@ -481,12 +485,16 @@ static int cinergyt2_open (struct inode *inode, struct file *file)
{
struct dvb_device *dvbdev = file->private_data;
struct cinergyt2 *cinergyt2 = dvbdev->priv;
- int err = -ERESTARTSYS;
+ int err = -EAGAIN;
- if (cinergyt2->disconnect_pending || mutex_lock_interruptible(&cinergyt2->wq_sem))
+ if (cinergyt2->disconnect_pending)
+ goto out;
+ err = mutex_lock_interruptible(&cinergyt2->wq_sem);
+ if (err)
goto out;
- if (mutex_lock_interruptible(&cinergyt2->sem))
+ err = mutex_lock_interruptible(&cinergyt2->sem);
+ if (err)
goto out_unlock1;
if ((err = dvb_generic_open(inode, file)))
@@ -550,7 +558,9 @@ static unsigned int cinergyt2_poll (struct file *file, struct poll_table_struct
struct cinergyt2 *cinergyt2 = dvbdev->priv;
unsigned int mask = 0;
- if (cinergyt2->disconnect_pending || mutex_lock_interruptible(&cinergyt2->sem))
+ if (cinergyt2->disconnect_pending)
+ return -EAGAIN;
+ if (mutex_lock_interruptible(&cinergyt2->sem))
return -ERESTARTSYS;
poll_wait(file, &cinergyt2->poll_wq, wait);
@@ -625,7 +635,9 @@ static int cinergyt2_ioctl (struct inode *inode, struct file *file,
if (copy_from_user(&p, (void __user*) arg, sizeof(p)))
return -EFAULT;
- if (cinergyt2->disconnect_pending || mutex_lock_interruptible(&cinergyt2->sem))
+ if (cinergyt2->disconnect_pending)
+ return -EAGAIN;
+ if (mutex_lock_interruptible(&cinergyt2->sem))
return -ERESTARTSYS;
param->cmd = CINERGYT2_EP1_SET_TUNER_PARAMETERS;
@@ -996,7 +1008,9 @@ static int cinergyt2_suspend (struct usb_interface *intf, pm_message_t state)
{
struct cinergyt2 *cinergyt2 = usb_get_intfdata (intf);
- if (cinergyt2->disconnect_pending || mutex_lock_interruptible(&cinergyt2->wq_sem))
+ if (cinergyt2->disconnect_pending)
+ return -EAGAIN;
+ if (mutex_lock_interruptible(&cinergyt2->wq_sem))
return -ERESTARTSYS;
cinergyt2_suspend_rc(cinergyt2);
@@ -1017,16 +1031,18 @@ static int cinergyt2_resume (struct usb_interface *intf)
{
struct cinergyt2 *cinergyt2 = usb_get_intfdata (intf);
struct dvbt_set_parameters_msg *param = &cinergyt2->param;
- int err = -ERESTARTSYS;
+ int err = -EAGAIN;
- if (cinergyt2->disconnect_pending || mutex_lock_interruptible(&cinergyt2->wq_sem))
+ if (cinergyt2->disconnect_pending)
+ goto out;
+ err = mutex_lock_interruptible(&cinergyt2->wq_sem);
+ if (err)
goto out;
- if (mutex_lock_interruptible(&cinergyt2->sem))
+ err = mutex_lock_interruptible(&cinergyt2->sem);
+ if (err)
goto out_unlock1;
- err = 0;
-
if (!cinergyt2->sleeping) {
cinergyt2_sleep(cinergyt2, 0);
cinergyt2_command(cinergyt2, (char *) param, sizeof(*param), NULL, 0);
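Every hunk in this cinergyT2 patch makes the same change: the old code folded two distinct failures — the device already unplugged, and a signal arriving while waiting for the mutex — into one test and returned -ERESTARTSYS for both. Split apart, a disconnect reports -EAGAIN while a signal still restarts the syscall. The shape of the conversion (priv is a stand-in):

	/* Before: one errno for two unrelated failures. */
	if (priv->disconnect_pending || mutex_lock_interruptible(&priv->sem))
		return -ERESTARTSYS;

	/* After: each failure reports its own errno. */
	if (priv->disconnect_pending)
		return -EAGAIN;
	if (mutex_lock_interruptible(&priv->sem))
		return -ERESTARTSYS;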
diff --git a/drivers/media/dvb/dvb-core/dvb_ca_en50221.c b/drivers/media/dvb/dvb-core/dvb_ca_en50221.c
index 084a508a03d..89437fdab8b 100644
--- a/drivers/media/dvb/dvb-core/dvb_ca_en50221.c
+++ b/drivers/media/dvb/dvb-core/dvb_ca_en50221.c
@@ -972,7 +972,7 @@ static int dvb_ca_en50221_thread(void *data)
/* main loop */
while (!kthread_should_stop()) {
/* sleep for a bit */
- while (!ca->wakeup) {
+ if (!ca->wakeup) {
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(ca->delay);
if (kthread_should_stop())
diff --git a/drivers/media/dvb/dvb-usb/dib0700_devices.c b/drivers/media/dvb/dvb-usb/dib0700_devices.c
index e8c4a869453..58452b52002 100644
--- a/drivers/media/dvb/dvb-usb/dib0700_devices.c
+++ b/drivers/media/dvb/dvb-usb/dib0700_devices.c
@@ -828,7 +828,7 @@ MODULE_DEVICE_TABLE(usb, dib0700_usb_id_table);
#define DIB0700_DEFAULT_DEVICE_PROPERTIES \
.caps = DVB_USB_IS_AN_I2C_ADAPTER, \
.usb_ctrl = DEVICE_SPECIFIC, \
- .firmware = "dvb-usb-dib0700-03-pre1.fw", \
+ .firmware = "dvb-usb-dib0700-1.10.fw", \
.download_firmware = dib0700_download_firmware, \
.no_reconnect = 1, \
.size_of_priv = sizeof(struct dib0700_state), \
diff --git a/drivers/media/radio/miropcm20-radio.c b/drivers/media/radio/miropcm20-radio.c
index c7c9d1dc069..3ae56fef8c9 100644
--- a/drivers/media/radio/miropcm20-radio.c
+++ b/drivers/media/radio/miropcm20-radio.c
@@ -229,7 +229,6 @@ static struct video_device pcm20_radio = {
.owner = THIS_MODULE,
.name = "Miro PCM 20 radio",
.type = VID_TYPE_TUNER,
- .hardware = VID_HARDWARE_RTRACK,
.fops = &pcm20_fops,
.priv = &pcm20_unit
};
diff --git a/drivers/media/radio/radio-gemtek.c b/drivers/media/radio/radio-gemtek.c
index 0c963db0361..5e4b9ddb23c 100644
--- a/drivers/media/radio/radio-gemtek.c
+++ b/drivers/media/radio/radio-gemtek.c
@@ -554,7 +554,6 @@ static struct video_device gemtek_radio = {
.owner = THIS_MODULE,
.name = "GemTek Radio card",
.type = VID_TYPE_TUNER,
- .hardware = VID_HARDWARE_GEMTEK,
.fops = &gemtek_fops,
.vidioc_querycap = vidioc_querycap,
.vidioc_g_tuner = vidioc_g_tuner,
diff --git a/drivers/media/video/arv.c b/drivers/media/video/arv.c
index 19e9929ffa0..c94a4d0f280 100644
--- a/drivers/media/video/arv.c
+++ b/drivers/media/video/arv.c
@@ -755,7 +755,6 @@ static struct video_device ar_template = {
.owner = THIS_MODULE,
.name = "Colour AR VGA",
.type = VID_TYPE_CAPTURE,
- .hardware = VID_HARDWARE_ARV,
.fops = &ar_fops,
.release = ar_release,
.minor = -1,
diff --git a/drivers/media/video/bt8xx/bttv-driver.c b/drivers/media/video/bt8xx/bttv-driver.c
index 7a332b3efe5..9feeb636ff9 100644
--- a/drivers/media/video/bt8xx/bttv-driver.c
+++ b/drivers/media/video/bt8xx/bttv-driver.c
@@ -3877,7 +3877,6 @@ static struct video_device bttv_video_template =
.name = "UNSET",
.type = VID_TYPE_CAPTURE|VID_TYPE_TUNER|
VID_TYPE_CLIPPING|VID_TYPE_SCALES,
- .hardware = VID_HARDWARE_BT848,
.fops = &bttv_fops,
.minor = -1,
};
@@ -3886,7 +3885,6 @@ static struct video_device bttv_vbi_template =
{
.name = "bt848/878 vbi",
.type = VID_TYPE_TUNER|VID_TYPE_TELETEXT,
- .hardware = VID_HARDWARE_BT848,
.fops = &bttv_fops,
.minor = -1,
};
@@ -4034,7 +4032,6 @@ static struct video_device radio_template =
{
.name = "bt848/878 radio",
.type = VID_TYPE_TUNER,
- .hardware = VID_HARDWARE_BT848,
.fops = &radio_fops,
.minor = -1,
};
diff --git a/drivers/media/video/bw-qcam.c b/drivers/media/video/bw-qcam.c
index 7f7e3d3398d..58423525591 100644
--- a/drivers/media/video/bw-qcam.c
+++ b/drivers/media/video/bw-qcam.c
@@ -899,7 +899,6 @@ static struct video_device qcam_template=
.owner = THIS_MODULE,
.name = "Connectix Quickcam",
.type = VID_TYPE_CAPTURE,
- .hardware = VID_HARDWARE_QCAM_BW,
.fops = &qcam_fops,
};
diff --git a/drivers/media/video/c-qcam.c b/drivers/media/video/c-qcam.c
index f76c6a6c376..cf1546b5a7f 100644
--- a/drivers/media/video/c-qcam.c
+++ b/drivers/media/video/c-qcam.c
@@ -699,7 +699,6 @@ static struct video_device qcam_template=
.owner = THIS_MODULE,
.name = "Colour QuickCam",
.type = VID_TYPE_CAPTURE,
- .hardware = VID_HARDWARE_QCAM_C,
.fops = &qcam_fops,
};
diff --git a/drivers/media/video/cpia.c b/drivers/media/video/cpia.c
index a1d02e5ce0f..7c630f5ee72 100644
--- a/drivers/media/video/cpia.c
+++ b/drivers/media/video/cpia.c
@@ -65,10 +65,6 @@ MODULE_PARM_DESC(colorspace_conv,
#define ABOUT "V4L-Driver for Vision CPiA based cameras"
-#ifndef VID_HARDWARE_CPIA
-#define VID_HARDWARE_CPIA 24 /* FIXME -> from linux/videodev.h */
-#endif
-
#define CPIA_MODULE_CPIA (0<<5)
#define CPIA_MODULE_SYSTEM (1<<5)
#define CPIA_MODULE_VP_CTRL (5<<5)
@@ -3804,7 +3800,6 @@ static struct video_device cpia_template = {
.owner = THIS_MODULE,
.name = "CPiA Camera",
.type = VID_TYPE_CAPTURE,
- .hardware = VID_HARDWARE_CPIA,
.fops = &cpia_fops,
};
diff --git a/drivers/media/video/cpia2/cpia2_v4l.c b/drivers/media/video/cpia2/cpia2_v4l.c
index e3aaba1e0e0..e378abec806 100644
--- a/drivers/media/video/cpia2/cpia2_v4l.c
+++ b/drivers/media/video/cpia2/cpia2_v4l.c
@@ -86,10 +86,6 @@ MODULE_LICENSE("GPL");
#define ABOUT "V4L-Driver for Vision CPiA2 based cameras"
-#ifndef VID_HARDWARE_CPIA2
-#error "VID_HARDWARE_CPIA2 should have been defined in linux/videodev.h"
-#endif
-
struct control_menu_info {
int value;
char name[32];
@@ -1942,7 +1938,6 @@ static struct video_device cpia2_template = {
.type= VID_TYPE_CAPTURE,
.type2 = V4L2_CAP_VIDEO_CAPTURE |
V4L2_CAP_STREAMING,
- .hardware= VID_HARDWARE_CPIA2,
.minor= -1,
.fops= &fops_template,
.release= video_device_release,
diff --git a/drivers/media/video/cx23885/cx23885-core.c b/drivers/media/video/cx23885/cx23885-core.c
index af16505bd2e..3cdd136477e 100644
--- a/drivers/media/video/cx23885/cx23885-core.c
+++ b/drivers/media/video/cx23885/cx23885-core.c
@@ -793,7 +793,7 @@ static int cx23885_dev_setup(struct cx23885_dev *dev)
dev->pci->subsystem_device);
cx23885_devcount--;
- goto fail_free;
+ return -ENODEV;
}
/* PCIe stuff */
@@ -835,10 +835,6 @@ static int cx23885_dev_setup(struct cx23885_dev *dev)
}
return 0;
-
-fail_free:
- kfree(dev);
- return -ENODEV;
}
void cx23885_dev_unregister(struct cx23885_dev *dev)
diff --git a/drivers/media/video/cx88/cx88-alsa.c b/drivers/media/video/cx88/cx88-alsa.c
index 141dadf7cf1..40ffd7a5579 100644
--- a/drivers/media/video/cx88/cx88-alsa.c
+++ b/drivers/media/video/cx88/cx88-alsa.c
@@ -39,6 +39,7 @@
#include <sound/pcm_params.h>
#include <sound/control.h>
#include <sound/initval.h>
+#include <sound/tlv.h>
#include "cx88.h"
#include "cx88-reg.h"
@@ -82,6 +83,7 @@ typedef struct cx88_audio_dev snd_cx88_card_t;
+
/****************************************************************************
Module global static vars
****************************************************************************/
@@ -545,8 +547,8 @@ static int __devinit snd_cx88_pcm(snd_cx88_card_t *chip, int device, char *name)
/****************************************************************************
CONTROL INTERFACE
****************************************************************************/
-static int snd_cx88_capture_volume_info(struct snd_kcontrol *kcontrol,
- struct snd_ctl_elem_info *info)
+static int snd_cx88_volume_info(struct snd_kcontrol *kcontrol,
+ struct snd_ctl_elem_info *info)
{
info->type = SNDRV_CTL_ELEM_TYPE_INTEGER;
info->count = 2;
@@ -556,9 +558,8 @@ static int snd_cx88_capture_volume_info(struct snd_kcontrol *kcontrol,
return 0;
}
-/* OK - TODO: test it */
-static int snd_cx88_capture_volume_get(struct snd_kcontrol *kcontrol,
- struct snd_ctl_elem_value *value)
+static int snd_cx88_volume_get(struct snd_kcontrol *kcontrol,
+ struct snd_ctl_elem_value *value)
{
snd_cx88_card_t *chip = snd_kcontrol_chip(kcontrol);
struct cx88_core *core=chip->core;
@@ -573,8 +574,8 @@ static int snd_cx88_capture_volume_get(struct snd_kcontrol *kcontrol,
}
/* OK - TODO: test it */
-static int snd_cx88_capture_volume_put(struct snd_kcontrol *kcontrol,
- struct snd_ctl_elem_value *value)
+static int snd_cx88_volume_put(struct snd_kcontrol *kcontrol,
+ struct snd_ctl_elem_value *value)
{
snd_cx88_card_t *chip = snd_kcontrol_chip(kcontrol);
struct cx88_core *core=chip->core;
@@ -605,14 +606,67 @@ static int snd_cx88_capture_volume_put(struct snd_kcontrol *kcontrol,
return changed;
}
-static struct snd_kcontrol_new snd_cx88_capture_volume = {
+static const DECLARE_TLV_DB_SCALE(snd_cx88_db_scale, -6300, 100, 0);
+
+static struct snd_kcontrol_new snd_cx88_volume = {
+ .iface = SNDRV_CTL_ELEM_IFACE_MIXER,
+ .access = SNDRV_CTL_ELEM_ACCESS_READWRITE |
+ SNDRV_CTL_ELEM_ACCESS_TLV_READ,
+ .name = "Playback Volume",
+ .info = snd_cx88_volume_info,
+ .get = snd_cx88_volume_get,
+ .put = snd_cx88_volume_put,
+ .tlv.p = snd_cx88_db_scale,
+};
+
+static int snd_cx88_switch_get(struct snd_kcontrol *kcontrol,
+ struct snd_ctl_elem_value *value)
+{
+ snd_cx88_card_t *chip = snd_kcontrol_chip(kcontrol);
+ struct cx88_core *core = chip->core;
+ u32 bit = kcontrol->private_value;
+
+ value->value.integer.value[0] = !(cx_read(AUD_VOL_CTL) & bit);
+ return 0;
+}
+
+static int snd_cx88_switch_put(struct snd_kcontrol *kcontrol,
+ struct snd_ctl_elem_value *value)
+{
+ snd_cx88_card_t *chip = snd_kcontrol_chip(kcontrol);
+ struct cx88_core *core = chip->core;
+ u32 bit = kcontrol->private_value;
+ int ret = 0;
+ u32 vol;
+
+ spin_lock_irq(&chip->reg_lock);
+ vol = cx_read(AUD_VOL_CTL);
+ if (value->value.integer.value[0] != !(vol & bit)) {
+ vol ^= bit;
+ cx_write(AUD_VOL_CTL, vol);
+ ret = 1;
+ }
+ spin_unlock_irq(&chip->reg_lock);
+ return ret;
+}
+
+static struct snd_kcontrol_new snd_cx88_dac_switch = {
.iface = SNDRV_CTL_ELEM_IFACE_MIXER,
- .name = "Capture Volume",
- .info = snd_cx88_capture_volume_info,
- .get = snd_cx88_capture_volume_get,
- .put = snd_cx88_capture_volume_put,
+ .name = "Playback Switch",
+ .info = snd_ctl_boolean_mono_info,
+ .get = snd_cx88_switch_get,
+ .put = snd_cx88_switch_put,
+ .private_value = (1<<8),
};
+static struct snd_kcontrol_new snd_cx88_source_switch = {
+ .iface = SNDRV_CTL_ELEM_IFACE_MIXER,
+ .name = "Capture Switch",
+ .info = snd_ctl_boolean_mono_info,
+ .get = snd_cx88_switch_get,
+ .put = snd_cx88_switch_put,
+ .private_value = (1<<6),
+};
/****************************************************************************
Basic Flow for Sound Devices
@@ -762,7 +816,13 @@ static int __devinit cx88_audio_initdev(struct pci_dev *pci,
if (err < 0)
goto error;
- err = snd_ctl_add(card, snd_ctl_new1(&snd_cx88_capture_volume, chip));
+ err = snd_ctl_add(card, snd_ctl_new1(&snd_cx88_volume, chip));
+ if (err < 0)
+ goto error;
+ err = snd_ctl_add(card, snd_ctl_new1(&snd_cx88_dac_switch, chip));
+ if (err < 0)
+ goto error;
+ err = snd_ctl_add(card, snd_ctl_new1(&snd_cx88_source_switch, chip));
if (err < 0)
goto error;
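The DECLARE_TLV_DB_SCALE() used above is what lets userspace translate raw control values into decibels: its arguments are hundredths of a dB, so (-6300, 100, 0) advertises a -63.00 dB to 0 dB range in 1 dB steps with no mute point. Roughly, the macro expands to an array like this (a sketch; the real macro also encodes the mute flag in the step word):

	static const unsigned int snd_cx88_db_scale[] = {
		SNDRV_CTL_TLVT_DB_SCALE,	/* type: linear dB scale      */
		2 * sizeof(unsigned int),	/* payload length in bytes    */
		-6300,				/* value 0  maps to -63.00 dB */
		100,				/* each step adds   +1.00 dB  */
	};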
diff --git a/drivers/media/video/cx88/cx88-blackbird.c b/drivers/media/video/cx88/cx88-blackbird.c
index 6d6f5048d76..f33f0b47142 100644
--- a/drivers/media/video/cx88/cx88-blackbird.c
+++ b/drivers/media/video/cx88/cx88-blackbird.c
@@ -527,44 +527,6 @@ static void blackbird_codec_settings(struct cx8802_dev *dev)
cx2341x_update(dev, blackbird_mbox_func, NULL, &dev->params);
}
-static struct v4l2_mpeg_compression default_mpeg_params = {
- .st_type = V4L2_MPEG_PS_2,
- .st_bitrate = {
- .mode = V4L2_BITRATE_CBR,
- .min = 0,
- .target = 0,
- .max = 0
- },
- .ts_pid_pmt = 16,
- .ts_pid_audio = 260,
- .ts_pid_video = 256,
- .ts_pid_pcr = 259,
- .ps_size = 0,
- .au_type = V4L2_MPEG_AU_2_II,
- .au_bitrate = {
- .mode = V4L2_BITRATE_CBR,
- .min = 224,
- .target = 224,
- .max = 224
- },
- .au_sample_rate = 48000,
- .au_pesid = 0,
- .vi_type = V4L2_MPEG_VI_2,
- .vi_aspect_ratio = V4L2_MPEG_ASPECT_4_3,
- .vi_bitrate = {
- .mode = V4L2_BITRATE_CBR,
- .min = 4000,
- .target = 4500,
- .max = 6000
- },
- .vi_frame_rate = 25,
- .vi_frames_per_gop = 12,
- .vi_bframes_count = 2,
- .vi_pesid = 0,
- .closed_gops = 1,
- .pulldown = 0
-};
-
static int blackbird_initialize_codec(struct cx8802_dev *dev)
{
struct cx88_core *core = dev->core;
@@ -852,23 +814,6 @@ static int vidioc_streamoff(struct file *file, void *priv, enum v4l2_buf_type i)
return videobuf_streamoff(&fh->mpegq);
}
-static int vidioc_g_mpegcomp (struct file *file, void *fh,
- struct v4l2_mpeg_compression *f)
-{
- printk(KERN_WARNING "VIDIOC_G_MPEGCOMP is obsolete. "
- "Replace with VIDIOC_G_EXT_CTRLS!");
- memcpy(f,&default_mpeg_params,sizeof(*f));
- return 0;
-}
-
-static int vidioc_s_mpegcomp (struct file *file, void *fh,
- struct v4l2_mpeg_compression *f)
-{
- printk(KERN_WARNING "VIDIOC_S_MPEGCOMP is obsolete. "
- "Replace with VIDIOC_S_EXT_CTRLS!");
- return 0;
-}
-
static int vidioc_g_ext_ctrls (struct file *file, void *priv,
struct v4l2_ext_controls *f)
{
@@ -1216,8 +1161,6 @@ static struct video_device cx8802_mpeg_template =
.vidioc_dqbuf = vidioc_dqbuf,
.vidioc_streamon = vidioc_streamon,
.vidioc_streamoff = vidioc_streamoff,
- .vidioc_g_mpegcomp = vidioc_g_mpegcomp,
- .vidioc_s_mpegcomp = vidioc_s_mpegcomp,
.vidioc_g_ext_ctrls = vidioc_g_ext_ctrls,
.vidioc_s_ext_ctrls = vidioc_s_ext_ctrls,
.vidioc_try_ext_ctrls = vidioc_try_ext_ctrls,
diff --git a/drivers/media/video/cx88/cx88-dvb.c b/drivers/media/video/cx88/cx88-dvb.c
index d16e5c6d21c..fce19caf9d0 100644
--- a/drivers/media/video/cx88/cx88-dvb.c
+++ b/drivers/media/video/cx88/cx88-dvb.c
@@ -475,8 +475,9 @@ static int dvb_register(struct cx8802_dev *dev)
break;
case CX88_BOARD_DNTV_LIVE_DVB_T_PRO:
#if defined(CONFIG_VIDEO_CX88_VP3054) || (defined(CONFIG_VIDEO_CX88_VP3054_MODULE) && defined(MODULE))
+ /* MT352 is on a secondary I2C bus made from some GPIO lines */
dev->dvb.frontend = dvb_attach(mt352_attach, &dntv_live_dvbt_pro_config,
- &((struct vp3054_i2c_state *)dev->card_priv)->adap);
+ &dev->vp3054->adap);
if (dev->dvb.frontend != NULL) {
dvb_attach(dvb_pll_attach, dev->dvb.frontend, 0x61,
&dev->core->i2c_adap, DVB_PLL_FMD1216ME);
diff --git a/drivers/media/video/cx88/cx88-mpeg.c b/drivers/media/video/cx88/cx88-mpeg.c
index a652f294d23..448c6738094 100644
--- a/drivers/media/video/cx88/cx88-mpeg.c
+++ b/drivers/media/video/cx88/cx88-mpeg.c
@@ -79,7 +79,8 @@ static int cx8802_start_dma(struct cx8802_dev *dev,
{
struct cx88_core *core = dev->core;
- dprintk(1, "cx8802_start_dma w: %d, h: %d, f: %d\n", dev->width, dev->height, buf->vb.field);
+ dprintk(1, "cx8802_start_dma w: %d, h: %d, f: %d\n",
+ buf->vb.width, buf->vb.height, buf->vb.field);
/* setup fifo + format */
cx88_sram_channel_setup(core, &cx88_sram_channels[SRAM_CH28],
@@ -177,7 +178,6 @@ static int cx8802_restart_queue(struct cx8802_dev *dev,
struct cx88_dmaqueue *q)
{
struct cx88_buffer *buf;
- struct list_head *item;
dprintk( 1, "cx8802_restart_queue\n" );
if (list_empty(&q->active))
@@ -223,10 +223,8 @@ static int cx8802_restart_queue(struct cx8802_dev *dev,
dprintk(2,"restart_queue [%p/%d]: restart dma\n",
buf, buf->vb.i);
cx8802_start_dma(dev, q, buf);
- list_for_each(item,&q->active) {
- buf = list_entry(item, struct cx88_buffer, vb.queue);
+ list_for_each_entry(buf, &q->active, vb.queue)
buf->count = q->count++;
- }
mod_timer(&q->timeout, jiffies+BUFFER_TIMEOUT);
return 0;
}
@@ -572,42 +570,29 @@ int cx8802_resume_common(struct pci_dev *pci_dev)
return 0;
}
+#if defined(CONFIG_VIDEO_CX88_BLACKBIRD) || \
+ defined(CONFIG_VIDEO_CX88_BLACKBIRD_MODULE)
struct cx8802_dev * cx8802_get_device(struct inode *inode)
{
int minor = iminor(inode);
- struct cx8802_dev *h = NULL;
- struct list_head *list;
+ struct cx8802_dev *dev;
- list_for_each(list,&cx8802_devlist) {
- h = list_entry(list, struct cx8802_dev, devlist);
- if (h->mpeg_dev && h->mpeg_dev->minor == minor)
- return h;
- }
+ list_for_each_entry(dev, &cx8802_devlist, devlist)
+ if (dev->mpeg_dev && dev->mpeg_dev->minor == minor)
+ return dev;
return NULL;
}
+EXPORT_SYMBOL(cx8802_get_device);
+#endif
struct cx8802_driver * cx8802_get_driver(struct cx8802_dev *dev, enum cx88_board_type btype)
{
- struct cx8802_dev *h = NULL;
- struct cx8802_driver *d = NULL;
- struct list_head *list;
- struct list_head *list2;
-
- list_for_each(list,&cx8802_devlist) {
- h = list_entry(list, struct cx8802_dev, devlist);
- if (h != dev)
- continue;
-
- list_for_each(list2, &h->drvlist.devlist) {
- d = list_entry(list2, struct cx8802_driver, devlist);
+ struct cx8802_driver *d;
- /* only unregister the correct driver type */
- if (d->type_id == btype) {
- return d;
- }
- }
- }
+ list_for_each_entry(d, &dev->drvlist, drvlist)
+ if (d->type_id == btype)
+ return d;
return NULL;
}
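Most of the churn in cx88-mpeg.c is this one mechanical conversion: a list_for_each() plus manual list_entry() pair becomes list_for_each_entry(), which folds the container_of() arithmetic into the iterator. Schematically, with a hypothetical struct foo keeping its list_head in a member called node and a list head called head:

	struct foo { int id; struct list_head node; };
	struct list_head *pos;
	struct foo *f;

	/* Old form: walk raw nodes, recover the container by hand. */
	list_for_each(pos, &head) {
		f = list_entry(pos, struct foo, node);
		/* ... use f ... */
	}

	/* New form: the iterator hands back the container directly. */
	list_for_each_entry(f, &head, node) {
		/* ... use f ... */
	}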
@@ -671,10 +656,9 @@ static int cx8802_check_driver(struct cx8802_driver *drv)
int cx8802_register_driver(struct cx8802_driver *drv)
{
- struct cx8802_dev *h;
+ struct cx8802_dev *dev;
struct cx8802_driver *driver;
- struct list_head *list;
- int err = 0, i = 0;
+ int err, i = 0;
printk(KERN_INFO
"cx88/2: registering cx8802 driver, type: %s access: %s\n",
@@ -686,14 +670,12 @@ int cx8802_register_driver(struct cx8802_driver *drv)
return err;
}
- list_for_each(list,&cx8802_devlist) {
- h = list_entry(list, struct cx8802_dev, devlist);
-
+ list_for_each_entry(dev, &cx8802_devlist, devlist) {
printk(KERN_INFO
"%s/2: subsystem: %04x:%04x, board: %s [card=%d]\n",
- h->core->name, h->pci->subsystem_vendor,
- h->pci->subsystem_device, h->core->board.name,
- h->core->boardnr);
+ dev->core->name, dev->pci->subsystem_vendor,
+ dev->pci->subsystem_device, dev->core->board.name,
+ dev->core->boardnr);
/* Bring up a new struct for each driver instance */
driver = kzalloc(sizeof(*drv),GFP_KERNEL);
@@ -701,7 +683,7 @@ int cx8802_register_driver(struct cx8802_driver *drv)
return -ENOMEM;
/* Snapshot of the driver registration data */
- drv->core = h->core;
+ drv->core = dev->core;
drv->suspend = cx8802_suspend_common;
drv->resume = cx8802_resume_common;
drv->request_acquire = cx8802_request_acquire;
@@ -712,49 +694,38 @@ int cx8802_register_driver(struct cx8802_driver *drv)
if (err == 0) {
i++;
mutex_lock(&drv->core->lock);
- list_add_tail(&driver->devlist,&h->drvlist.devlist);
+ list_add_tail(&driver->drvlist, &dev->drvlist);
mutex_unlock(&drv->core->lock);
} else {
printk(KERN_ERR
"%s/2: cx8802 probe failed, err = %d\n",
- h->core->name, err);
+ dev->core->name, err);
}
}
- if (i == 0)
- err = -ENODEV;
- else
- err = 0;
- return err;
+ return i ? 0 : -ENODEV;
}
int cx8802_unregister_driver(struct cx8802_driver *drv)
{
- struct cx8802_dev *h;
- struct cx8802_driver *d;
- struct list_head *list;
- struct list_head *list2, *q;
- int err = 0, i = 0;
+ struct cx8802_dev *dev;
+ struct cx8802_driver *d, *dtmp;
+ int err = 0;
printk(KERN_INFO
"cx88/2: unregistering cx8802 driver, type: %s access: %s\n",
drv->type_id == CX88_MPEG_DVB ? "dvb" : "blackbird",
drv->hw_access == CX8802_DRVCTL_SHARED ? "shared" : "exclusive");
- list_for_each(list,&cx8802_devlist) {
- i++;
- h = list_entry(list, struct cx8802_dev, devlist);
-
+ list_for_each_entry(dev, &cx8802_devlist, devlist) {
printk(KERN_INFO
"%s/2: subsystem: %04x:%04x, board: %s [card=%d]\n",
- h->core->name, h->pci->subsystem_vendor,
- h->pci->subsystem_device, h->core->board.name,
- h->core->boardnr);
-
- list_for_each_safe(list2, q, &h->drvlist.devlist) {
- d = list_entry(list2, struct cx8802_driver, devlist);
+ dev->core->name, dev->pci->subsystem_vendor,
+ dev->pci->subsystem_device, dev->core->board.name,
+ dev->core->boardnr);
+ list_for_each_entry_safe(d, dtmp, &dev->drvlist, drvlist) {
/* only unregister the correct driver type */
if (d->type_id != drv->type_id)
continue;
@@ -762,12 +733,12 @@ int cx8802_unregister_driver(struct cx8802_driver *drv)
err = d->remove(d);
if (err == 0) {
mutex_lock(&drv->core->lock);
- list_del(list2);
+ list_del(&d->drvlist);
mutex_unlock(&drv->core->lock);
+ kfree(d);
} else
printk(KERN_ERR "%s/2: cx8802 driver remove "
- "failed (%d)\n", h->core->name, err);
-
+ "failed (%d)\n", dev->core->name, err);
}
}
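Where the loop body unlinks and frees the current element, as above, the plain iterator would chase a freed next pointer; list_for_each_entry_safe() caches the successor in a second cursor before the body runs. Continuing the struct foo sketch:

	struct foo *f, *tmp;

	list_for_each_entry_safe(f, tmp, &head, node) {
		list_del(&f->node);	/* safe: tmp already points past f */
		kfree(f);
	}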
@@ -805,7 +776,7 @@ static int __devinit cx8802_probe(struct pci_dev *pci_dev,
if (err != 0)
goto fail_free;
- INIT_LIST_HEAD(&dev->drvlist.devlist);
+ INIT_LIST_HEAD(&dev->drvlist);
list_add_tail(&dev->devlist,&cx8802_devlist);
/* Maintain a reference so cx88-video can query the 8802 device. */
@@ -825,23 +796,30 @@ static int __devinit cx8802_probe(struct pci_dev *pci_dev,
static void __devexit cx8802_remove(struct pci_dev *pci_dev)
{
struct cx8802_dev *dev;
- struct cx8802_driver *h;
- struct list_head *list;
dev = pci_get_drvdata(pci_dev);
dprintk( 1, "%s\n", __FUNCTION__);
- list_for_each(list,&dev->drvlist.devlist) {
- h = list_entry(list, struct cx8802_driver, devlist);
- dprintk( 1, " ->driver\n");
- if (h->remove == NULL) {
- printk(KERN_ERR "%s .. skipping driver, no probe function\n", __FUNCTION__);
- continue;
+ if (!list_empty(&dev->drvlist)) {
+ struct cx8802_driver *drv, *tmp;
+ int err;
+
+ printk(KERN_WARNING "%s/2: Trying to remove cx8802 driver "
+ "while cx8802 sub-drivers still loaded?!\n",
+ dev->core->name);
+
+ list_for_each_entry_safe(drv, tmp, &dev->drvlist, drvlist) {
+ err = drv->remove(drv);
+ if (err == 0) {
+ mutex_lock(&drv->core->lock);
+ list_del(&drv->drvlist);
+ mutex_unlock(&drv->core->lock);
+ } else
+ printk(KERN_ERR "%s/2: cx8802 driver remove "
+ "failed (%d)\n", dev->core->name, err);
+ kfree(drv);
}
- printk(KERN_INFO "%s .. Removing driver type %d\n", __FUNCTION__, h->type_id);
- cx8802_unregister_driver(h);
- list_del(&dev->drvlist.devlist);
}
/* Destroy any 8802 reference. */
@@ -901,7 +879,6 @@ EXPORT_SYMBOL(cx8802_fini_common);
EXPORT_SYMBOL(cx8802_register_driver);
EXPORT_SYMBOL(cx8802_unregister_driver);
-EXPORT_SYMBOL(cx8802_get_device);
EXPORT_SYMBOL(cx8802_get_driver);
/* ----------------------------------------------------------- */
/*
diff --git a/drivers/media/video/cx88/cx88-video.c b/drivers/media/video/cx88/cx88-video.c
index 231ae6c4dd2..5ee05f8f3fa 100644
--- a/drivers/media/video/cx88/cx88-video.c
+++ b/drivers/media/video/cx88/cx88-video.c
@@ -1675,7 +1675,6 @@ static struct video_device cx8800_radio_template =
{
.name = "cx8800-radio",
.type = VID_TYPE_TUNER,
- .hardware = 0,
.fops = &radio_fops,
.minor = -1,
.vidioc_querycap = radio_querycap,
diff --git a/drivers/media/video/cx88/cx88-vp3054-i2c.c b/drivers/media/video/cx88/cx88-vp3054-i2c.c
index 77c37889232..6ce5af48847 100644
--- a/drivers/media/video/cx88/cx88-vp3054-i2c.c
+++ b/drivers/media/video/cx88/cx88-vp3054-i2c.c
@@ -41,7 +41,7 @@ static void vp3054_bit_setscl(void *data, int state)
{
struct cx8802_dev *dev = data;
struct cx88_core *core = dev->core;
- struct vp3054_i2c_state *vp3054_i2c = dev->card_priv;
+ struct vp3054_i2c_state *vp3054_i2c = dev->vp3054;
if (state) {
vp3054_i2c->state |= 0x0001; /* SCL high */
@@ -58,7 +58,7 @@ static void vp3054_bit_setsda(void *data, int state)
{
struct cx8802_dev *dev = data;
struct cx88_core *core = dev->core;
- struct vp3054_i2c_state *vp3054_i2c = dev->card_priv;
+ struct vp3054_i2c_state *vp3054_i2c = dev->vp3054;
if (state) {
vp3054_i2c->state |= 0x0002; /* SDA high */
@@ -113,10 +113,10 @@ int vp3054_i2c_probe(struct cx8802_dev *dev)
if (core->boardnr != CX88_BOARD_DNTV_LIVE_DVB_T_PRO)
return 0;
- dev->card_priv = kzalloc(sizeof(*vp3054_i2c), GFP_KERNEL);
- if (dev->card_priv == NULL)
+ vp3054_i2c = kzalloc(sizeof(*vp3054_i2c), GFP_KERNEL);
+ if (vp3054_i2c == NULL)
return -ENOMEM;
- vp3054_i2c = dev->card_priv;
+ dev->vp3054 = vp3054_i2c;
memcpy(&vp3054_i2c->algo, &vp3054_i2c_algo_template,
sizeof(vp3054_i2c->algo));
@@ -139,8 +139,8 @@ int vp3054_i2c_probe(struct cx8802_dev *dev)
if (0 != rc) {
printk("%s: vp3054_i2c register FAILED\n", core->name);
- kfree(dev->card_priv);
- dev->card_priv = NULL;
+ kfree(dev->vp3054);
+ dev->vp3054 = NULL;
}
return rc;
@@ -148,7 +148,7 @@ int vp3054_i2c_probe(struct cx8802_dev *dev)
void vp3054_i2c_remove(struct cx8802_dev *dev)
{
- struct vp3054_i2c_state *vp3054_i2c = dev->card_priv;
+ struct vp3054_i2c_state *vp3054_i2c = dev->vp3054;
if (vp3054_i2c == NULL ||
dev->core->boardnr != CX88_BOARD_DNTV_LIVE_DVB_T_PRO)
diff --git a/drivers/media/video/cx88/cx88.h b/drivers/media/video/cx88/cx88.h
index 42e0a9b8c55..eb296bdecb1 100644
--- a/drivers/media/video/cx88/cx88.h
+++ b/drivers/media/video/cx88/cx88.h
@@ -412,7 +412,9 @@ struct cx8802_suspend_state {
struct cx8802_driver {
struct cx88_core *core;
- struct list_head devlist;
+
+ /* List of drivers attached to device */
+ struct list_head drvlist;
/* Type of driver and access required */
enum cx88_board_type type_id;
@@ -453,27 +455,33 @@ struct cx8802_dev {
/* for blackbird only */
struct list_head devlist;
+#if defined(CONFIG_VIDEO_CX88_BLACKBIRD) || \
+ defined(CONFIG_VIDEO_CX88_BLACKBIRD_MODULE)
struct video_device *mpeg_dev;
u32 mailbox;
int width;
int height;
+ /* mpeg params */
+ struct cx2341x_mpeg_params params;
+#endif
+
#if defined(CONFIG_VIDEO_CX88_DVB) || defined(CONFIG_VIDEO_CX88_DVB_MODULE)
/* for dvb only */
struct videobuf_dvb dvb;
+#endif
- void *card_priv;
+#if defined(CONFIG_VIDEO_CX88_VP3054) || \
+ defined(CONFIG_VIDEO_CX88_VP3054_MODULE)
+ /* For VP3054 secondary I2C bus support */
+ struct vp3054_i2c_state *vp3054;
#endif
/* for switching modulation types */
unsigned char ts_gen_cntrl;
- /* mpeg params */
- struct cx2341x_mpeg_params params;
-
/* List of attached drivers */
- struct cx8802_driver drvlist;
- struct work_struct request_module_wk;
-
+ struct list_head drvlist;
+ struct work_struct request_module_wk;
};
/* ----------------------------------------------------------- */
diff --git a/drivers/media/video/em28xx/em28xx-core.c b/drivers/media/video/em28xx/em28xx-core.c
index d3282ec62c5..d56484f2046 100644
--- a/drivers/media/video/em28xx/em28xx-core.c
+++ b/drivers/media/video/em28xx/em28xx-core.c
@@ -648,7 +648,7 @@ void em28xx_uninit_isoc(struct em28xx *dev)
*/
int em28xx_init_isoc(struct em28xx *dev)
{
- /* change interface to 3 which allowes the biggest packet sizes */
+ /* change interface to 3 which allows the biggest packet sizes */
int i, errCode;
const int sb_size = EM28XX_NUM_PACKETS * dev->max_pkt_size;
@@ -673,6 +673,7 @@ int em28xx_init_isoc(struct em28xx *dev)
("unable to allocate %i bytes for transfer buffer %i\n",
sb_size, i);
em28xx_uninit_isoc(dev);
+ usb_free_urb(urb);
return -ENOMEM;
}
memset(dev->transfer_buffer[i], 0, sb_size);
diff --git a/drivers/media/video/em28xx/em28xx-video.c b/drivers/media/video/em28xx/em28xx-video.c
index e467682aabd..a4c2a907124 100644
--- a/drivers/media/video/em28xx/em28xx-video.c
+++ b/drivers/media/video/em28xx/em28xx-video.c
@@ -1617,7 +1617,6 @@ static int em28xx_init_dev(struct em28xx **devhandle, struct usb_device *udev,
/* Fills VBI device info */
dev->vbi_dev->type = VFL_TYPE_VBI;
- dev->vbi_dev->hardware = 0;
dev->vbi_dev->fops = &em28xx_v4l_fops;
dev->vbi_dev->minor = -1;
dev->vbi_dev->dev = &dev->udev->dev;
@@ -1629,7 +1628,6 @@ static int em28xx_init_dev(struct em28xx **devhandle, struct usb_device *udev,
dev->vdev->type = VID_TYPE_CAPTURE;
if (dev->has_tuner)
dev->vdev->type |= VID_TYPE_TUNER;
- dev->vdev->hardware = 0;
dev->vdev->fops = &em28xx_v4l_fops;
dev->vdev->minor = -1;
dev->vdev->dev = &dev->udev->dev;
diff --git a/drivers/media/video/et61x251/et61x251_core.c b/drivers/media/video/et61x251/et61x251_core.c
index d5fef4c01c8..d19d73b81ed 100644
--- a/drivers/media/video/et61x251/et61x251_core.c
+++ b/drivers/media/video/et61x251/et61x251_core.c
@@ -2585,7 +2585,6 @@ et61x251_usb_probe(struct usb_interface* intf, const struct usb_device_id* id)
strcpy(cam->v4ldev->name, "ET61X[12]51 PC Camera");
cam->v4ldev->owner = THIS_MODULE;
cam->v4ldev->type = VID_TYPE_CAPTURE | VID_TYPE_SCALES;
- cam->v4ldev->hardware = 0;
cam->v4ldev->fops = &et61x251_fops;
cam->v4ldev->minor = video_nr[dev_nr];
cam->v4ldev->release = video_device_release;
diff --git a/drivers/media/video/ir-kbd-i2c.c b/drivers/media/video/ir-kbd-i2c.c
index d98dd0d1e37..29779d8bf7f 100644
--- a/drivers/media/video/ir-kbd-i2c.c
+++ b/drivers/media/video/ir-kbd-i2c.c
@@ -528,6 +528,7 @@ static int ir_probe(struct i2c_adapter *adap)
break;
case I2C_HW_B_CX2388x:
probe = probe_cx88;
+ break;
case I2C_HW_B_CX23885:
probe = probe_cx23885;
break;
diff --git a/drivers/media/video/ivtv/ivtv-driver.c b/drivers/media/video/ivtv/ivtv-driver.c
index fd7a932e1d3..6d2dd8764f8 100644
--- a/drivers/media/video/ivtv/ivtv-driver.c
+++ b/drivers/media/video/ivtv/ivtv-driver.c
@@ -1003,8 +1003,6 @@ static int __devinit ivtv_probe(struct pci_dev *dev,
IVTV_DEBUG_INFO("base addr: 0x%08x\n", itv->base_addr);
- mutex_lock(&itv->serialize_lock);
-
/* PCI Device Setup */
if ((retval = ivtv_setup_pci(itv, dev, pci_id)) != 0) {
if (retval == -EIO)
@@ -1064,7 +1062,7 @@ static int __devinit ivtv_probe(struct pci_dev *dev,
IVTV_DEBUG_INFO("activating i2c...\n");
if (init_ivtv_i2c(itv)) {
IVTV_ERR("Could not initialize i2c\n");
- goto free_irq;
+ goto free_io;
}
IVTV_DEBUG_INFO("Active card count: %d.\n", ivtv_cards_active);
@@ -1176,7 +1174,11 @@ static int __devinit ivtv_probe(struct pci_dev *dev,
IVTV_ERR("Failed to register irq %d\n", retval);
goto free_streams;
}
- mutex_unlock(&itv->serialize_lock);
+ retval = ivtv_streams_register(itv);
+ if (retval) {
+ IVTV_ERR("Error %d registering devices\n", retval);
+ goto free_irq;
+ }
IVTV_INFO("Initialized card #%d: %s\n", itv->num, itv->card_name);
return 0;
@@ -1195,7 +1197,6 @@ static int __devinit ivtv_probe(struct pci_dev *dev,
release_mem_region(itv->base_addr + IVTV_DECODER_OFFSET, IVTV_DECODER_SIZE);
free_workqueue:
destroy_workqueue(itv->irq_work_queues);
- mutex_unlock(&itv->serialize_lock);
err:
if (retval == 0)
retval = -ENODEV;
diff --git a/drivers/media/video/ivtv/ivtv-driver.h b/drivers/media/video/ivtv/ivtv-driver.h
index 3bda1df63cb..49ce14d14a5 100644
--- a/drivers/media/video/ivtv/ivtv-driver.h
+++ b/drivers/media/video/ivtv/ivtv-driver.h
@@ -51,6 +51,7 @@
#include <linux/unistd.h>
#include <linux/byteorder/swab.h>
#include <linux/pagemap.h>
+#include <linux/scatterlist.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <asm/uaccess.h>
diff --git a/drivers/media/video/ivtv/ivtv-fileops.c b/drivers/media/video/ivtv/ivtv-fileops.c
index da50fa4a72a..a200a8a95a2 100644
--- a/drivers/media/video/ivtv/ivtv-fileops.c
+++ b/drivers/media/video/ivtv/ivtv-fileops.c
@@ -822,6 +822,11 @@ int ivtv_v4l2_close(struct inode *inode, struct file *filp)
crystal_freq.flags = 0;
ivtv_saa7115(itv, VIDIOC_INT_S_CRYSTAL_FREQ, &crystal_freq);
}
+ if (atomic_read(&itv->capturing) > 0) {
+ /* Undo video mute */
+ ivtv_vapi(itv, CX2341X_ENC_MUTE_VIDEO, 1,
+ itv->params.video_mute | (itv->params.video_mute_yuv << 8));
+ }
/* Done! Unmute and continue. */
ivtv_unmute(itv);
ivtv_release_stream(s);
@@ -892,6 +897,7 @@ static int ivtv_serialized_open(struct ivtv_stream *s, struct file *filp)
if (atomic_read(&itv->capturing) > 0) {
/* switching to radio while capture is
in progress is not polite */
+ ivtv_release_stream(s);
kfree(item);
return -EBUSY;
}
@@ -947,7 +953,7 @@ int ivtv_v4l2_open(struct inode *inode, struct file *filp)
if (itv == NULL) {
/* Couldn't find a device registered
on that minor, shouldn't happen! */
- IVTV_WARN("No ivtv device found on minor %d\n", minor);
+ printk(KERN_WARNING "No ivtv device found on minor %d\n", minor);
return -ENXIO;
}
diff --git a/drivers/media/video/ivtv/ivtv-ioctl.c b/drivers/media/video/ivtv/ivtv-ioctl.c
index 206eee7542d..fd6826f472e 100644
--- a/drivers/media/video/ivtv/ivtv-ioctl.c
+++ b/drivers/media/video/ivtv/ivtv-ioctl.c
@@ -555,6 +555,7 @@ static int ivtv_try_or_set_fmt(struct ivtv *itv, int streamtype,
/* set window size */
if (fmt->type == V4L2_BUF_TYPE_VIDEO_CAPTURE) {
+ struct cx2341x_mpeg_params *p = &itv->params;
int w = fmt->fmt.pix.width;
int h = fmt->fmt.pix.height;
@@ -566,17 +567,19 @@ static int ivtv_try_or_set_fmt(struct ivtv *itv, int streamtype,
fmt->fmt.pix.width = w;
fmt->fmt.pix.height = h;
- if (!set_fmt || (itv->params.width == w && itv->params.height == h))
+ if (!set_fmt || (p->width == w && p->height == h))
return 0;
if (atomic_read(&itv->capturing) > 0)
return -EBUSY;
- itv->params.width = w;
- itv->params.height = h;
+ p->width = w;
+ p->height = h;
if (w != 720 || h != (itv->is_50hz ? 576 : 480))
- itv->params.video_temporal_filter = 0;
+ p->video_temporal_filter = 0;
else
- itv->params.video_temporal_filter = 8;
+ p->video_temporal_filter = 8;
+ if (p->video_encoding == V4L2_MPEG_VIDEO_ENCODING_MPEG_1)
+ fmt->fmt.pix.width /= 2;
itv->video_dec_func(itv, VIDIOC_S_FMT, fmt);
return ivtv_get_fmt(itv, streamtype, fmt);
}
diff --git a/drivers/media/video/ivtv/ivtv-streams.c b/drivers/media/video/ivtv/ivtv-streams.c
index fd135985e70..aa03e61ef31 100644
--- a/drivers/media/video/ivtv/ivtv-streams.c
+++ b/drivers/media/video/ivtv/ivtv-streams.c
@@ -166,10 +166,9 @@ static void ivtv_stream_init(struct ivtv *itv, int type)
ivtv_queue_init(&s->q_io);
}
-static int ivtv_reg_dev(struct ivtv *itv, int type)
+static int ivtv_prep_dev(struct ivtv *itv, int type)
{
struct ivtv_stream *s = &itv->streams[type];
- int vfl_type = ivtv_stream_info[type].vfl_type;
int minor_offset = ivtv_stream_info[type].minor_offset;
int minor;
@@ -187,15 +186,12 @@ static int ivtv_reg_dev(struct ivtv *itv, int type)
if (type >= IVTV_DEC_STREAM_TYPE_MPG && !(itv->v4l2_cap & V4L2_CAP_VIDEO_OUTPUT))
return 0;
- if (minor_offset >= 0)
- /* card number + user defined offset + device offset */
- minor = itv->num + ivtv_first_minor + minor_offset;
- else
- minor = -1;
+ /* card number + user defined offset + device offset */
+ minor = itv->num + ivtv_first_minor + minor_offset;
/* User explicitly selected 0 buffers for these streams, so don't
create them. */
- if (minor >= 0 && ivtv_stream_info[type].dma != PCI_DMA_NONE &&
+ if (ivtv_stream_info[type].dma != PCI_DMA_NONE &&
itv->options.kilobytes[type] == 0) {
IVTV_INFO("Disabled %s device\n", ivtv_stream_info[type].name);
return 0;
@@ -223,21 +219,53 @@ static int ivtv_reg_dev(struct ivtv *itv, int type)
s->v4l2dev->fops = ivtv_stream_info[type].fops;
s->v4l2dev->release = video_device_release;
- if (minor >= 0) {
- /* Register device. First try the desired minor, then any free one. */
- if (video_register_device(s->v4l2dev, vfl_type, minor) &&
- video_register_device(s->v4l2dev, vfl_type, -1)) {
- IVTV_ERR("Couldn't register v4l2 device for %s minor %d\n",
- s->name, minor);
- video_device_release(s->v4l2dev);
- s->v4l2dev = NULL;
- return -ENOMEM;
- }
+ return 0;
+}
+
+/* Initialize v4l2 variables and prepare v4l2 devices */
+int ivtv_streams_setup(struct ivtv *itv)
+{
+ int type;
+
+ /* Setup V4L2 Devices */
+ for (type = 0; type < IVTV_MAX_STREAMS; type++) {
+ /* Prepare device */
+ if (ivtv_prep_dev(itv, type))
+ break;
+
+ if (itv->streams[type].v4l2dev == NULL)
+ continue;
+
+ /* Allocate Stream */
+ if (ivtv_stream_alloc(&itv->streams[type]))
+ break;
}
- else {
- /* Don't register a 'hidden' stream (OSD) */
- IVTV_INFO("Created framebuffer stream for %s\n", s->name);
+ if (type == IVTV_MAX_STREAMS)
return 0;
+
+ /* One or more streams could not be initialized. Clean 'em all up. */
+ ivtv_streams_cleanup(itv);
+ return -ENOMEM;
+}
+
+static int ivtv_reg_dev(struct ivtv *itv, int type)
+{
+ struct ivtv_stream *s = &itv->streams[type];
+ int vfl_type = ivtv_stream_info[type].vfl_type;
+ int minor;
+
+ if (s->v4l2dev == NULL)
+ return 0;
+
+ minor = s->v4l2dev->minor;
+ /* Register device. First try the desired minor, then any free one. */
+ if (video_register_device(s->v4l2dev, vfl_type, minor) &&
+ video_register_device(s->v4l2dev, vfl_type, -1)) {
+ IVTV_ERR("Couldn't register v4l2 device for %s minor %d\n",
+ s->name, minor);
+ video_device_release(s->v4l2dev);
+ s->v4l2dev = NULL;
+ return -ENOMEM;
}
switch (vfl_type) {
@@ -262,27 +290,18 @@ static int ivtv_reg_dev(struct ivtv *itv, int type)
return 0;
}
-/* Initialize v4l2 variables and register v4l2 devices */
-int ivtv_streams_setup(struct ivtv *itv)
+/* Register v4l2 devices */
+int ivtv_streams_register(struct ivtv *itv)
{
int type;
+ int err = 0;
- /* Setup V4L2 Devices */
- for (type = 0; type < IVTV_MAX_STREAMS; type++) {
- /* Register Device */
- if (ivtv_reg_dev(itv, type))
- break;
-
- if (itv->streams[type].v4l2dev == NULL)
- continue;
+ /* Register V4L2 devices */
+ for (type = 0; type < IVTV_MAX_STREAMS; type++)
+ err |= ivtv_reg_dev(itv, type);
- /* Allocate Stream */
- if (ivtv_stream_alloc(&itv->streams[type]))
- break;
- }
- if (type == IVTV_MAX_STREAMS) {
+ if (err == 0)
return 0;
- }
/* One or more streams could not be initialized. Clean 'em all up. */
ivtv_streams_cleanup(itv);
@@ -303,11 +322,8 @@ void ivtv_streams_cleanup(struct ivtv *itv)
continue;
ivtv_stream_free(&itv->streams[type]);
- /* Free Device */
- if (vdev->minor == -1) /* 'Hidden' never registered stream (OSD) */
- video_device_release(vdev);
- else /* All others, just unregister. */
- video_unregister_device(vdev);
+ /* Unregister device */
+ video_unregister_device(vdev);
}
}
@@ -425,6 +441,7 @@ int ivtv_start_v4l2_encode_stream(struct ivtv_stream *s)
{
u32 data[CX2341X_MBOX_MAX_DATA];
struct ivtv *itv = s->itv;
+ struct cx2341x_mpeg_params *p = &itv->params;
int captype = 0, subtype = 0;
int enable_passthrough = 0;
@@ -445,7 +462,7 @@ int ivtv_start_v4l2_encode_stream(struct ivtv_stream *s)
}
itv->mpg_data_received = itv->vbi_data_inserted = 0;
itv->dualwatch_jiffies = jiffies;
- itv->dualwatch_stereo_mode = itv->params.audio_properties & 0x0300;
+ itv->dualwatch_stereo_mode = p->audio_properties & 0x0300;
itv->search_pack_header = 0;
break;
@@ -477,9 +494,6 @@ int ivtv_start_v4l2_encode_stream(struct ivtv_stream *s)
s->subtype = subtype;
s->buffers_stolen = 0;
- /* mute/unmute video */
- ivtv_vapi(itv, CX2341X_ENC_MUTE_VIDEO, 1, test_bit(IVTV_F_I_RADIO_USER, &itv->i_flags) ? 1 : 0);
-
/* Clear Streamoff flags in case left from last capture */
clear_bit(IVTV_F_S_STREAMOFF, &s->s_flags);
@@ -536,7 +550,12 @@ int ivtv_start_v4l2_encode_stream(struct ivtv_stream *s)
itv->pgm_info_offset, itv->pgm_info_num);
/* Setup API for Stream */
- cx2341x_update(itv, ivtv_api_func, NULL, &itv->params);
+ cx2341x_update(itv, ivtv_api_func, NULL, p);
+
+ /* mute if capturing radio */
+ if (test_bit(IVTV_F_I_RADIO_USER, &itv->i_flags))
+ ivtv_vapi(itv, CX2341X_ENC_MUTE_VIDEO, 1,
+ 1 | (p->video_mute_yuv << 8));
}
/* Vsync Setup */
@@ -585,6 +604,7 @@ static int ivtv_setup_v4l2_decode_stream(struct ivtv_stream *s)
{
u32 data[CX2341X_MBOX_MAX_DATA];
struct ivtv *itv = s->itv;
+ struct cx2341x_mpeg_params *p = &itv->params;
int datatype;
if (s->v4l2dev == NULL)
@@ -623,7 +643,7 @@ static int ivtv_setup_v4l2_decode_stream(struct ivtv_stream *s)
break;
}
if (ivtv_vapi(itv, CX2341X_DEC_SET_DECODER_SOURCE, 4, datatype,
- itv->params.width, itv->params.height, itv->params.audio_properties)) {
+ p->width, p->height, p->audio_properties)) {
IVTV_DEBUG_WARN("Couldn't initialize decoder source\n");
}
return 0;
diff --git a/drivers/media/video/ivtv/ivtv-streams.h b/drivers/media/video/ivtv/ivtv-streams.h
index 8f5f5b1c7c8..3d76a415fbd 100644
--- a/drivers/media/video/ivtv/ivtv-streams.h
+++ b/drivers/media/video/ivtv/ivtv-streams.h
@@ -22,6 +22,7 @@
#define IVTV_STREAMS_H
int ivtv_streams_setup(struct ivtv *itv);
+int ivtv_streams_register(struct ivtv *itv);
void ivtv_streams_cleanup(struct ivtv *itv);
/* Capture related */
diff --git a/drivers/media/video/ivtv/ivtv-udma.c b/drivers/media/video/ivtv/ivtv-udma.c
index c4626d1cdf4..912b424e520 100644
--- a/drivers/media/video/ivtv/ivtv-udma.c
+++ b/drivers/media/video/ivtv/ivtv-udma.c
@@ -63,10 +63,10 @@ int ivtv_udma_fill_sg_list (struct ivtv_user_dma *dma, struct ivtv_dma_page_info
memcpy(page_address(dma->bouncemap[map_offset]) + offset, src, len);
kunmap_atomic(src, KM_BOUNCE_READ);
local_irq_restore(flags);
- dma->SGlist[map_offset].page = dma->bouncemap[map_offset];
+ sg_set_page(&dma->SGlist[map_offset], dma->bouncemap[map_offset]);
}
else {
- dma->SGlist[map_offset].page = dma->map[map_offset];
+ sg_set_page(&dma->SGlist[map_offset], dma->map[map_offset]);
}
offset = 0;
map_offset++;
diff --git a/drivers/media/video/ivtv/ivtv-yuv.c b/drivers/media/video/ivtv/ivtv-yuv.c
index e2288f224ab..9091c4837bb 100644
--- a/drivers/media/video/ivtv/ivtv-yuv.c
+++ b/drivers/media/video/ivtv/ivtv-yuv.c
@@ -710,7 +710,7 @@ static u32 ivtv_yuv_window_setup (struct ivtv *itv, struct yuv_frame_info *windo
/* If there's nothing safe to display, we may as well stop now */
if ((int)window->dst_w <= 2 || (int)window->dst_h <= 2 || (int)window->src_w <= 2 || (int)window->src_h <= 2) {
- return 0;
+ return IVTV_YUV_UPDATE_INVALID;
}
/* Ensure video remains inside OSD area */
@@ -791,7 +791,7 @@ static u32 ivtv_yuv_window_setup (struct ivtv *itv, struct yuv_frame_info *windo
/* Check again. If there's nothing safe to display, stop now */
if ((int)window->dst_w <= 2 || (int)window->dst_h <= 2 || (int)window->src_w <= 2 || (int)window->src_h <= 2) {
- return 0;
+ return IVTV_YUV_UPDATE_INVALID;
}
/* Both x offset & width are linked, so they have to be done together */
@@ -840,110 +840,118 @@ void ivtv_yuv_work_handler (struct ivtv *itv)
if (!(yuv_update = ivtv_yuv_window_setup (itv, &window)))
return;
- /* Update horizontal settings */
- if (yuv_update & IVTV_YUV_UPDATE_HORIZONTAL)
- ivtv_yuv_handle_horizontal(itv, &window);
+ if (yuv_update & IVTV_YUV_UPDATE_INVALID) {
+ write_reg(0x01008080, 0x2898);
+ } else if (yuv_update) {
+ write_reg(0x00108080, 0x2898);
- if (yuv_update & IVTV_YUV_UPDATE_VERTICAL)
- ivtv_yuv_handle_vertical(itv, &window);
+ if (yuv_update & IVTV_YUV_UPDATE_HORIZONTAL)
+ ivtv_yuv_handle_horizontal(itv, &window);
+
+ if (yuv_update & IVTV_YUV_UPDATE_VERTICAL)
+ ivtv_yuv_handle_vertical(itv, &window);
+ }
memcpy(&itv->yuv_info.old_frame_info, &window, sizeof (itv->yuv_info.old_frame_info));
}
static void ivtv_yuv_init (struct ivtv *itv)
{
+ struct yuv_playback_info *yi = &itv->yuv_info;
+
IVTV_DEBUG_YUV("ivtv_yuv_init\n");
/* Take a snapshot of the current register settings */
- itv->yuv_info.reg_2834 = read_reg(0x02834);
- itv->yuv_info.reg_2838 = read_reg(0x02838);
- itv->yuv_info.reg_283c = read_reg(0x0283c);
- itv->yuv_info.reg_2840 = read_reg(0x02840);
- itv->yuv_info.reg_2844 = read_reg(0x02844);
- itv->yuv_info.reg_2848 = read_reg(0x02848);
- itv->yuv_info.reg_2854 = read_reg(0x02854);
- itv->yuv_info.reg_285c = read_reg(0x0285c);
- itv->yuv_info.reg_2864 = read_reg(0x02864);
- itv->yuv_info.reg_2870 = read_reg(0x02870);
- itv->yuv_info.reg_2874 = read_reg(0x02874);
- itv->yuv_info.reg_2898 = read_reg(0x02898);
- itv->yuv_info.reg_2890 = read_reg(0x02890);
-
- itv->yuv_info.reg_289c = read_reg(0x0289c);
- itv->yuv_info.reg_2918 = read_reg(0x02918);
- itv->yuv_info.reg_291c = read_reg(0x0291c);
- itv->yuv_info.reg_2920 = read_reg(0x02920);
- itv->yuv_info.reg_2924 = read_reg(0x02924);
- itv->yuv_info.reg_2928 = read_reg(0x02928);
- itv->yuv_info.reg_292c = read_reg(0x0292c);
- itv->yuv_info.reg_2930 = read_reg(0x02930);
- itv->yuv_info.reg_2934 = read_reg(0x02934);
- itv->yuv_info.reg_2938 = read_reg(0x02938);
- itv->yuv_info.reg_293c = read_reg(0x0293c);
- itv->yuv_info.reg_2940 = read_reg(0x02940);
- itv->yuv_info.reg_2944 = read_reg(0x02944);
- itv->yuv_info.reg_2948 = read_reg(0x02948);
- itv->yuv_info.reg_294c = read_reg(0x0294c);
- itv->yuv_info.reg_2950 = read_reg(0x02950);
- itv->yuv_info.reg_2954 = read_reg(0x02954);
- itv->yuv_info.reg_2958 = read_reg(0x02958);
- itv->yuv_info.reg_295c = read_reg(0x0295c);
- itv->yuv_info.reg_2960 = read_reg(0x02960);
- itv->yuv_info.reg_2964 = read_reg(0x02964);
- itv->yuv_info.reg_2968 = read_reg(0x02968);
- itv->yuv_info.reg_296c = read_reg(0x0296c);
- itv->yuv_info.reg_2970 = read_reg(0x02970);
-
- itv->yuv_info.v_filter_1 = -1;
- itv->yuv_info.v_filter_2 = -1;
- itv->yuv_info.h_filter = -1;
+ yi->reg_2834 = read_reg(0x02834);
+ yi->reg_2838 = read_reg(0x02838);
+ yi->reg_283c = read_reg(0x0283c);
+ yi->reg_2840 = read_reg(0x02840);
+ yi->reg_2844 = read_reg(0x02844);
+ yi->reg_2848 = read_reg(0x02848);
+ yi->reg_2854 = read_reg(0x02854);
+ yi->reg_285c = read_reg(0x0285c);
+ yi->reg_2864 = read_reg(0x02864);
+ yi->reg_2870 = read_reg(0x02870);
+ yi->reg_2874 = read_reg(0x02874);
+ yi->reg_2898 = read_reg(0x02898);
+ yi->reg_2890 = read_reg(0x02890);
+
+ yi->reg_289c = read_reg(0x0289c);
+ yi->reg_2918 = read_reg(0x02918);
+ yi->reg_291c = read_reg(0x0291c);
+ yi->reg_2920 = read_reg(0x02920);
+ yi->reg_2924 = read_reg(0x02924);
+ yi->reg_2928 = read_reg(0x02928);
+ yi->reg_292c = read_reg(0x0292c);
+ yi->reg_2930 = read_reg(0x02930);
+ yi->reg_2934 = read_reg(0x02934);
+ yi->reg_2938 = read_reg(0x02938);
+ yi->reg_293c = read_reg(0x0293c);
+ yi->reg_2940 = read_reg(0x02940);
+ yi->reg_2944 = read_reg(0x02944);
+ yi->reg_2948 = read_reg(0x02948);
+ yi->reg_294c = read_reg(0x0294c);
+ yi->reg_2950 = read_reg(0x02950);
+ yi->reg_2954 = read_reg(0x02954);
+ yi->reg_2958 = read_reg(0x02958);
+ yi->reg_295c = read_reg(0x0295c);
+ yi->reg_2960 = read_reg(0x02960);
+ yi->reg_2964 = read_reg(0x02964);
+ yi->reg_2968 = read_reg(0x02968);
+ yi->reg_296c = read_reg(0x0296c);
+ yi->reg_2970 = read_reg(0x02970);
+
+ yi->v_filter_1 = -1;
+ yi->v_filter_2 = -1;
+ yi->h_filter = -1;
/* Set some valid size info */
- itv->yuv_info.osd_x_offset = read_reg(0x02a04) & 0x00000FFF;
- itv->yuv_info.osd_y_offset = (read_reg(0x02a04) >> 16) & 0x00000FFF;
+ yi->osd_x_offset = read_reg(0x02a04) & 0x00000FFF;
+ yi->osd_y_offset = (read_reg(0x02a04) >> 16) & 0x00000FFF;
/* Bit 2 of reg 2878 indicates current decoder output format
0 : NTSC 1 : PAL */
if (read_reg(0x2878) & 4)
- itv->yuv_info.decode_height = 576;
+ yi->decode_height = 576;
else
- itv->yuv_info.decode_height = 480;
+ yi->decode_height = 480;
- /* If no visible size set, assume full size */
- if (!itv->yuv_info.osd_vis_w)
- itv->yuv_info.osd_vis_w = 720 - itv->yuv_info.osd_x_offset;
-
- if (!itv->yuv_info.osd_vis_h) {
- itv->yuv_info.osd_vis_h = itv->yuv_info.decode_height - itv->yuv_info.osd_y_offset;
+ if (!itv->osd_info) {
+ yi->osd_vis_w = 720 - yi->osd_x_offset;
+ yi->osd_vis_h = yi->decode_height - yi->osd_y_offset;
} else {
- /* If output video standard has changed, requested height may
- not be legal */
- if (itv->yuv_info.osd_vis_h + itv->yuv_info.osd_y_offset > itv->yuv_info.decode_height) {
- IVTV_DEBUG_WARN("Clipping yuv output - fb size (%d) exceeds video standard limit (%d)\n",
- itv->yuv_info.osd_vis_h + itv->yuv_info.osd_y_offset,
- itv->yuv_info.decode_height);
- itv->yuv_info.osd_vis_h = itv->yuv_info.decode_height - itv->yuv_info.osd_y_offset;
+ /* If no visible size set, assume full size */
+ if (!yi->osd_vis_w)
+ yi->osd_vis_w = 720 - yi->osd_x_offset;
+
+ if (!yi->osd_vis_h)
+ yi->osd_vis_h = yi->decode_height - yi->osd_y_offset;
+ else {
+ /* If output video standard has changed, requested height may
+ not be legal */
+ if (yi->osd_vis_h + yi->osd_y_offset > yi->decode_height) {
+ IVTV_DEBUG_WARN("Clipping yuv output - fb size (%d) exceeds video standard limit (%d)\n",
+ yi->osd_vis_h + yi->osd_y_offset,
+ yi->decode_height);
+ yi->osd_vis_h = yi->decode_height - yi->osd_y_offset;
+ }
}
}
/* We need a buffer for blanking when Y plane is offset - non-fatal if we can't get one */
- itv->yuv_info.blanking_ptr = kzalloc(720*16,GFP_KERNEL);
- if (itv->yuv_info.blanking_ptr) {
- itv->yuv_info.blanking_dmaptr = pci_map_single(itv->dev, itv->yuv_info.blanking_ptr, 720*16, PCI_DMA_TODEVICE);
- }
+ yi->blanking_ptr = kzalloc(720*16, GFP_KERNEL);
+ if (yi->blanking_ptr)
+ yi->blanking_dmaptr = pci_map_single(itv->dev, yi->blanking_ptr, 720*16, PCI_DMA_TODEVICE);
else {
- itv->yuv_info.blanking_dmaptr = 0;
- IVTV_DEBUG_WARN ("Failed to allocate yuv blanking buffer\n");
+ yi->blanking_dmaptr = 0;
+ IVTV_DEBUG_WARN("Failed to allocate yuv blanking buffer\n");
}
- IVTV_DEBUG_WARN("Enable video output\n");
- write_reg_sync(0x00108080, 0x2898);
-
/* Enable YUV decoder output */
write_reg_sync(0x01, IVTV_REG_VDM);
set_bit(IVTV_F_I_DECODING_YUV, &itv->i_flags);
- atomic_set(&itv->yuv_info.next_dma_frame,0);
+ atomic_set(&yi->next_dma_frame, 0);
}
int ivtv_yuv_prep_frame(struct ivtv *itv, struct ivtv_dma_frame *args)
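The ivtv-yuv.c hunk above is mostly mechanical: it caches &itv->yuv_info in a local pointer once and rewrites every access through it. A minimal userspace sketch of that aliasing pattern (struct and field names here are illustrative, not the driver's):

#include <stdio.h>

struct yuv_playback_info {
	int osd_x_offset;
	int osd_y_offset;
};

struct card {
	struct yuv_playback_info yuv_info;
};

static void init_yuv(struct card *itv)
{
	/* One dereference computed up front; every later access is
	 * shorter to read and identical in behaviour. */
	struct yuv_playback_info *yi = &itv->yuv_info;

	yi->osd_x_offset = 0;
	yi->osd_y_offset = 16;
}

int main(void)
{
	struct card c = { { -1, -1 } };

	init_yuv(&c);
	printf("%d %d\n", c.yuv_info.osd_x_offset, c.yuv_info.osd_y_offset);
	return 0;
}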
diff --git a/drivers/media/video/ivtv/ivtv-yuv.h b/drivers/media/video/ivtv/ivtv-yuv.h
index f7215eeca01..3b966f0a204 100644
--- a/drivers/media/video/ivtv/ivtv-yuv.h
+++ b/drivers/media/video/ivtv/ivtv-yuv.h
@@ -34,6 +34,7 @@
#define IVTV_YUV_UPDATE_HORIZONTAL 0x01
#define IVTV_YUV_UPDATE_VERTICAL 0x02
+#define IVTV_YUV_UPDATE_INVALID 0x04
extern const u32 yuv_offset[4];
diff --git a/drivers/media/video/ivtv/ivtvfb.c b/drivers/media/video/ivtv/ivtvfb.c
index 9684048fe56..52ffd154a3d 100644
--- a/drivers/media/video/ivtv/ivtvfb.c
+++ b/drivers/media/video/ivtv/ivtvfb.c
@@ -55,7 +55,6 @@
static int ivtvfb_card_id = -1;
static int ivtvfb_debug = 0;
static int osd_laced;
-static int osd_compat;
static int osd_depth;
static int osd_upper;
static int osd_left;
@@ -65,7 +64,6 @@ static int osd_xres;
module_param(ivtvfb_card_id, int, 0444);
module_param_named(debug,ivtvfb_debug, int, 0644);
module_param(osd_laced, bool, 0444);
-module_param(osd_compat, bool, 0444);
module_param(osd_depth, int, 0444);
module_param(osd_upper, int, 0444);
module_param(osd_left, int, 0444);
@@ -80,12 +78,6 @@ MODULE_PARM_DESC(debug,
"Debug level (bitmask). Default: errors only\n"
"\t\t\t(debug = 3 gives full debugging)");
-MODULE_PARM_DESC(osd_compat,
- "Compatibility mode - Display size is locked (use for old X drivers)\n"
- "\t\t\t0=off\n"
- "\t\t\t1=on\n"
- "\t\t\tdefault off");
-
/* Why upper, left, xres, yres, depth, laced ? To match terminology used
by fbset.
Why start at 1 for left & upper coordinate ? Because X doesn't allow 0 */
@@ -166,9 +158,6 @@ struct osd_info {
unsigned long fb_end_aligned_physaddr;
#endif
- /* Current osd mode */
- int osd_mode;
-
/* Store the buffer offset */
int set_osd_coords_x;
int set_osd_coords_y;
@@ -470,13 +459,11 @@ static int ivtvfb_set_var(struct ivtv *itv, struct fb_var_screeninfo *var)
IVTVFB_DEBUG_WARN("ivtvfb_set_var - Invalid bpp\n");
}
- /* Change osd mode if needed.
- Although rare, things can go wrong. The extra mode
- change seems to help... */
- if (osd_mode != -1 && osd_mode != oi->osd_mode) {
+ /* Set video mode. Although rare, the display can become scrambled even
+ if we don't change mode. Always 'bounce' to osd_mode via mode 0 */
+ if (osd_mode != -1) {
ivtv_vapi(itv, CX2341X_OSD_SET_PIXEL_FORMAT, 1, 0);
ivtv_vapi(itv, CX2341X_OSD_SET_PIXEL_FORMAT, 1, osd_mode);
- oi->osd_mode = osd_mode;
}
oi->bits_per_pixel = var->bits_per_pixel;
@@ -579,14 +566,6 @@ static int _ivtvfb_check_var(struct fb_var_screeninfo *var, struct ivtv *itv)
osd_height_limit = 480;
}
- /* Check the bits per pixel */
- if (osd_compat) {
- if (var->bits_per_pixel != 32) {
- IVTVFB_DEBUG_WARN("Invalid colour mode: %d\n", var->bits_per_pixel);
- return -EINVAL;
- }
- }
-
if (var->bits_per_pixel == 8 || var->bits_per_pixel == 32) {
var->transp.offset = 24;
var->transp.length = 8;
@@ -638,32 +617,20 @@ static int _ivtvfb_check_var(struct fb_var_screeninfo *var, struct ivtv *itv)
}
/* Check the resolution */
- if (osd_compat) {
- if (var->xres != oi->ivtvfb_defined.xres ||
- var->yres != oi->ivtvfb_defined.yres ||
- var->xres_virtual != oi->ivtvfb_defined.xres_virtual ||
- var->yres_virtual != oi->ivtvfb_defined.yres_virtual) {
- IVTVFB_DEBUG_WARN("Invalid resolution: %dx%d (virtual %dx%d)\n",
- var->xres, var->yres, var->xres_virtual, var->yres_virtual);
- return -EINVAL;
- }
+ if (var->xres > IVTV_OSD_MAX_WIDTH || var->yres > osd_height_limit) {
+ IVTVFB_DEBUG_WARN("Invalid resolution: %dx%d\n",
+ var->xres, var->yres);
+ return -EINVAL;
}
- else {
- if (var->xres > IVTV_OSD_MAX_WIDTH || var->yres > osd_height_limit) {
- IVTVFB_DEBUG_WARN("Invalid resolution: %dx%d\n",
- var->xres, var->yres);
- return -EINVAL;
- }
- /* Max horizontal size is 1023 @ 32bpp, 2046 & 16bpp, 4092 @ 8bpp */
- if (var->xres_virtual > 4095 / (var->bits_per_pixel / 8) ||
- var->xres_virtual * var->yres_virtual * (var->bits_per_pixel / 8) > oi->video_buffer_size ||
- var->xres_virtual < var->xres ||
- var->yres_virtual < var->yres) {
- IVTVFB_DEBUG_WARN("Invalid virtual resolution: %dx%d\n",
- var->xres_virtual, var->yres_virtual);
- return -EINVAL;
- }
+ /* Max horizontal size is 1023 @ 32bpp, 2046 & 16bpp, 4092 @ 8bpp */
+ if (var->xres_virtual > 4095 / (var->bits_per_pixel / 8) ||
+ var->xres_virtual * var->yres_virtual * (var->bits_per_pixel / 8) > oi->video_buffer_size ||
+ var->xres_virtual < var->xres ||
+ var->yres_virtual < var->yres) {
+ IVTVFB_DEBUG_WARN("Invalid virtual resolution: %dx%d\n",
+ var->xres_virtual, var->yres_virtual);
+ return -EINVAL;
}
/* Some extra checks if in 8 bit mode */
@@ -877,17 +844,15 @@ static int ivtvfb_init_vidmode(struct ivtv *itv)
/* Color mode */
- if (osd_compat) osd_depth = 32;
- if (osd_depth != 8 && osd_depth != 16 && osd_depth != 32) osd_depth = 8;
+ if (osd_depth != 8 && osd_depth != 16 && osd_depth != 32)
+ osd_depth = 8;
oi->bits_per_pixel = osd_depth;
oi->bytes_per_pixel = oi->bits_per_pixel / 8;
- /* Invalidate current osd mode to force a mode switch later */
- oi->osd_mode = -1;
-
/* Horizontal size & position */
- if (osd_xres > 720) osd_xres = 720;
+ if (osd_xres > 720)
+ osd_xres = 720;
/* Must be a multiple of 4 for 8bpp & 2 for 16bpp */
if (osd_depth == 8)
@@ -895,10 +860,7 @@ static int ivtvfb_init_vidmode(struct ivtv *itv)
else if (osd_depth == 16)
osd_xres &= ~1;
- if (osd_xres)
- start_window.width = osd_xres;
- else
- start_window.width = osd_compat ? 720: 640;
+ start_window.width = osd_xres ? osd_xres : 640;
/* Check horizontal start (osd_left). */
if (osd_left && osd_left + start_window.width > 721) {
@@ -921,10 +883,7 @@ static int ivtvfb_init_vidmode(struct ivtv *itv)
if (osd_yres > max_height)
osd_yres = max_height;
- if (osd_yres)
- start_window.height = osd_yres;
- else
- start_window.height = osd_compat ? max_height : (itv->is_50hz ? 480 : 400);
+ start_window.height = osd_yres ? osd_yres : itv->is_50hz ? 480 : 400;
/* Check vertical start (osd_upper). */
if (osd_upper + start_window.height > max_height + 1) {
@@ -1127,10 +1086,6 @@ static int ivtvfb_init_card(struct ivtv *itv)
/* Enable the osd */
ivtvfb_blank(FB_BLANK_UNBLANK, &itv->osd_info->ivtvfb_info);
- /* Note if we're running in compatibility mode */
- if (osd_compat)
- IVTVFB_INFO("Running in compatibility mode. Display resize & mode change disabled\n");
-
/* Allocate DMA */
ivtv_udma_alloc(itv);
return 0;
@@ -1177,9 +1132,12 @@ static void ivtvfb_cleanup(void)
for (i = 0; i < ivtv_cards_active; i++) {
itv = ivtv_cards[i];
if (itv && (itv->v4l2_cap & V4L2_CAP_VIDEO_OUTPUT) && itv->osd_info) {
+ if (unregister_framebuffer(&itv->osd_info->ivtvfb_info)) {
+ IVTVFB_WARN("Framebuffer %d is in use, cannot unload\n", i);
+ return;
+ }
IVTVFB_DEBUG_INFO("Unregister framebuffer %d\n", i);
ivtvfb_blank(FB_BLANK_POWERDOWN, &itv->osd_info->ivtvfb_info);
- unregister_framebuffer(&itv->osd_info->ivtvfb_info);
ivtvfb_release_buffers(itv);
itv->osd_video_pbase = 0;
}
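The ivtvfb_cleanup() hunk reorders tear-down so unregister_framebuffer() runs first and its return value is checked; buffers are only released once no userspace mapping can reach them. A sketch of that ordering, assuming a hypothetical driver-private release helper:

#include <linux/fb.h>
#include <linux/errno.h>

/* Sketch only: my_release_buffers() stands in for the driver's
 * real buffer/DMA tear-down. */
static void my_release_buffers(struct fb_info *info)
{
	/* free DMA areas, video memory, ... */
}

static int my_fb_cleanup(struct fb_info *info)
{
	if (unregister_framebuffer(info)) {
		/* Still open somewhere: freeing now would leave users
		 * with dangling mappings, so refuse to unload. */
		return -EBUSY;
	}
	my_release_buffers(info);	/* safe: device is gone */
	return 0;
}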
diff --git a/drivers/media/video/meye.c b/drivers/media/video/meye.c
index 69283926a8d..c3116329043 100644
--- a/drivers/media/video/meye.c
+++ b/drivers/media/video/meye.c
@@ -1762,7 +1762,6 @@ static struct video_device meye_template = {
.owner = THIS_MODULE,
.name = "meye",
.type = VID_TYPE_CAPTURE,
- .hardware = VID_HARDWARE_MEYE,
.fops = &meye_fops,
.release = video_device_release,
.minor = -1,
diff --git a/drivers/media/video/ov511.c b/drivers/media/video/ov511.c
index b8d4ac0d938..d55d5800efb 100644
--- a/drivers/media/video/ov511.c
+++ b/drivers/media/video/ov511.c
@@ -4668,7 +4668,6 @@ static struct video_device vdev_template = {
.owner = THIS_MODULE,
.name = "OV511 USB Camera",
.type = VID_TYPE_CAPTURE,
- .hardware = VID_HARDWARE_OV511,
.fops = &ov511_fops,
.release = video_device_release,
.minor = -1,
diff --git a/drivers/media/video/planb.c b/drivers/media/video/planb.c
index 0ef73d9d584..ce4b2f9791e 100644
--- a/drivers/media/video/planb.c
+++ b/drivers/media/video/planb.c
@@ -2013,7 +2013,6 @@ static struct video_device planb_template=
.owner = THIS_MODULE,
.name = PLANB_DEVICE_NAME,
.type = VID_TYPE_OVERLAY,
- .hardware = VID_HARDWARE_PLANB,
.open = planb_open,
.close = planb_close,
.read = planb_read,
diff --git a/drivers/media/video/pms.c b/drivers/media/video/pms.c
index b5a67f0dd19..6820c2aabd3 100644
--- a/drivers/media/video/pms.c
+++ b/drivers/media/video/pms.c
@@ -895,7 +895,6 @@ static struct video_device pms_template=
.owner = THIS_MODULE,
.name = "Mediavision PMS",
.type = VID_TYPE_CAPTURE,
- .hardware = VID_HARDWARE_PMS,
.fops = &pms_fops,
};
diff --git a/drivers/media/video/pvrusb2/pvrusb2-encoder.c b/drivers/media/video/pvrusb2/pvrusb2-encoder.c
index 20b614436d2..205087a3e13 100644
--- a/drivers/media/video/pvrusb2/pvrusb2-encoder.c
+++ b/drivers/media/video/pvrusb2/pvrusb2-encoder.c
@@ -209,6 +209,11 @@ static int pvr2_encoder_cmd(void *ctxt,
LOCK_TAKE(hdw->ctl_lock); do {
+ if (!hdw->flag_encoder_ok) {
+ ret = -EIO;
+ break;
+ }
+
retry_flag = 0;
try_count++;
ret = 0;
@@ -273,6 +278,7 @@ static int pvr2_encoder_cmd(void *ctxt,
ret = -EBUSY;
}
if (ret) {
+ hdw->flag_encoder_ok = 0;
pvr2_trace(
PVR2_TRACE_ERROR_LEGS,
"Giving up on command."
diff --git a/drivers/media/video/pvrusb2/pvrusb2-hdw-internal.h b/drivers/media/video/pvrusb2/pvrusb2-hdw-internal.h
index 985d9ae7f5e..f873994b088 100644
--- a/drivers/media/video/pvrusb2/pvrusb2-hdw-internal.h
+++ b/drivers/media/video/pvrusb2/pvrusb2-hdw-internal.h
@@ -225,11 +225,12 @@ struct pvr2_hdw {
unsigned int cmd_debug_write_len; //
unsigned int cmd_debug_read_len; //
- int flag_ok; // device in known good state
- int flag_disconnected; // flag_ok == 0 due to disconnect
- int flag_init_ok; // true if structure is fully initialized
- int flag_streaming_enabled; // true if streaming should be on
- int fw1_state; // current situation with fw1
+ int flag_ok; /* device in known good state */
+ int flag_disconnected; /* flag_ok == 0 due to disconnect */
+ int flag_init_ok; /* true if structure is fully initialized */
+ int flag_streaming_enabled; /* true if streaming should be on */
+ int fw1_state; /* current situation with fw1 */
+ int flag_encoder_ok; /* True if encoder is healthy */
int flag_decoder_is_tuned;
diff --git a/drivers/media/video/pvrusb2/pvrusb2-hdw.c b/drivers/media/video/pvrusb2/pvrusb2-hdw.c
index 27b12b4b5c8..402c5948825 100644
--- a/drivers/media/video/pvrusb2/pvrusb2-hdw.c
+++ b/drivers/media/video/pvrusb2/pvrusb2-hdw.c
@@ -1248,6 +1248,8 @@ int pvr2_upload_firmware2(struct pvr2_hdw *hdw)
time we configure the encoder, then we'll fully configure it. */
hdw->enc_cur_valid = 0;
+ hdw->flag_encoder_ok = 0;
+
/* First prepare firmware loading */
ret |= pvr2_write_register(hdw, 0x0048, 0xffffffff); /*interrupt mask*/
ret |= pvr2_hdw_gpio_chg_dir(hdw,0xffffffff,0x00000088); /*gpio dir*/
@@ -1346,6 +1348,7 @@ int pvr2_upload_firmware2(struct pvr2_hdw *hdw)
pvr2_trace(PVR2_TRACE_ERROR_LEGS,
"firmware2 upload post-proc failure");
} else {
+ hdw->flag_encoder_ok = !0;
hdw->subsys_enabled_mask |= (1<<PVR2_SUBSYS_B_ENC_FIRMWARE);
}
return ret;
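flag_encoder_ok implements a simple health latch: cleared before firmware upload, set only after the upload post-processing succeeds, checked before every encoder command, and cleared again when a command gives up. A compact sketch of the pattern (all names here are illustrative, not the pvrusb2 API):

#include <linux/errno.h>

struct enc_state {
	int encoder_ok;		/* health latch */
};

/* Hypothetical low-level helpers, for the sketch only. */
static int hw_send_cmd(struct enc_state *s) { return 0; }
static int hw_upload_fw(struct enc_state *s) { return 0; }

static int enc_cmd(struct enc_state *s)
{
	if (!s->encoder_ok)
		return -EIO;		/* don't poke a known-bad encoder */
	if (hw_send_cmd(s)) {
		s->encoder_ok = 0;	/* latch the failure */
		return -EIO;
	}
	return 0;
}

static int enc_load_firmware(struct enc_state *s)
{
	s->encoder_ok = 0;		/* pessimistic until proven good */
	if (hw_upload_fw(s))
		return -EIO;
	s->encoder_ok = 1;		/* only a fresh upload clears the latch */
	return 0;
}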
diff --git a/drivers/media/video/pvrusb2/pvrusb2-v4l2.c b/drivers/media/video/pvrusb2/pvrusb2-v4l2.c
index 4563b3df8a0..7a596ea7cfe 100644
--- a/drivers/media/video/pvrusb2/pvrusb2-v4l2.c
+++ b/drivers/media/video/pvrusb2/pvrusb2-v4l2.c
@@ -1121,15 +1121,12 @@ static const struct file_operations vdev_fops = {
};
-#define VID_HARDWARE_PVRUSB2 38 /* FIXME : need a good value */
-
static struct video_device vdev_template = {
.owner = THIS_MODULE,
.type = VID_TYPE_CAPTURE | VID_TYPE_TUNER,
.type2 = (V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VBI_CAPTURE
| V4L2_CAP_TUNER | V4L2_CAP_AUDIO
| V4L2_CAP_READWRITE),
- .hardware = VID_HARDWARE_PVRUSB2,
.fops = &vdev_fops,
};
diff --git a/drivers/media/video/pwc/pwc-if.c b/drivers/media/video/pwc/pwc-if.c
index 950da254214..7300ace8f44 100644
--- a/drivers/media/video/pwc/pwc-if.c
+++ b/drivers/media/video/pwc/pwc-if.c
@@ -166,7 +166,6 @@ static struct video_device pwc_template = {
.owner = THIS_MODULE,
.name = "Philips Webcam", /* Filled in later */
.type = VID_TYPE_CAPTURE,
- .hardware = VID_HARDWARE_PWC,
.release = video_device_release,
.fops = &pwc_fops,
.minor = -1,
diff --git a/drivers/media/video/saa7134/saa6752hs.c b/drivers/media/video/saa7134/saa6752hs.c
index 57f1f5d409e..002e70a33a4 100644
--- a/drivers/media/video/saa7134/saa6752hs.c
+++ b/drivers/media/video/saa7134/saa6752hs.c
@@ -71,7 +71,6 @@ static const struct v4l2_format v4l2_format_table[] =
struct saa6752hs_state {
struct i2c_client client;
- struct v4l2_mpeg_compression old_params;
struct saa6752hs_mpeg_params params;
enum saa6752hs_videoformat video_format;
v4l2_std_id standard;
@@ -161,35 +160,6 @@ static struct saa6752hs_mpeg_params param_defaults =
.au_l2_bitrate = V4L2_MPEG_AUDIO_L2_BITRATE_256K,
};
-static struct v4l2_mpeg_compression old_param_defaults =
-{
- .st_type = V4L2_MPEG_TS_2,
- .st_bitrate = {
- .mode = V4L2_BITRATE_CBR,
- .target = 7000,
- },
-
- .ts_pid_pmt = 16,
- .ts_pid_video = 260,
- .ts_pid_audio = 256,
- .ts_pid_pcr = 259,
-
- .vi_type = V4L2_MPEG_VI_2,
- .vi_aspect_ratio = V4L2_MPEG_ASPECT_4_3,
- .vi_bitrate = {
- .mode = V4L2_BITRATE_VBR,
- .target = 4000,
- .max = 6000,
- },
-
- .au_type = V4L2_MPEG_AU_2_II,
- .au_bitrate = {
- .mode = V4L2_BITRATE_CBR,
- .target = 256,
- },
-
-};
-
/* ---------------------------------------------------------------------- */
static int saa6752hs_chip_command(struct i2c_client* client,
@@ -362,74 +332,6 @@ static void saa6752hs_set_subsampling(struct i2c_client* client,
}
-static void saa6752hs_old_set_params(struct i2c_client* client,
- struct v4l2_mpeg_compression* params)
-{
- struct saa6752hs_state *h = i2c_get_clientdata(client);
-
- /* check PIDs */
- if (params->ts_pid_pmt <= MPEG_PID_MAX) {
- h->old_params.ts_pid_pmt = params->ts_pid_pmt;
- h->params.ts_pid_pmt = params->ts_pid_pmt;
- }
- if (params->ts_pid_pcr <= MPEG_PID_MAX) {
- h->old_params.ts_pid_pcr = params->ts_pid_pcr;
- h->params.ts_pid_pcr = params->ts_pid_pcr;
- }
- if (params->ts_pid_video <= MPEG_PID_MAX) {
- h->old_params.ts_pid_video = params->ts_pid_video;
- h->params.ts_pid_video = params->ts_pid_video;
- }
- if (params->ts_pid_audio <= MPEG_PID_MAX) {
- h->old_params.ts_pid_audio = params->ts_pid_audio;
- h->params.ts_pid_audio = params->ts_pid_audio;
- }
-
- /* check bitrate parameters */
- if ((params->vi_bitrate.mode == V4L2_BITRATE_CBR) ||
- (params->vi_bitrate.mode == V4L2_BITRATE_VBR)) {
- h->old_params.vi_bitrate.mode = params->vi_bitrate.mode;
- h->params.vi_bitrate_mode = (params->vi_bitrate.mode == V4L2_BITRATE_VBR) ?
- V4L2_MPEG_VIDEO_BITRATE_MODE_VBR : V4L2_MPEG_VIDEO_BITRATE_MODE_CBR;
- }
- if (params->vi_bitrate.mode != V4L2_BITRATE_NONE)
- h->old_params.st_bitrate.target = params->st_bitrate.target;
- if (params->vi_bitrate.mode != V4L2_BITRATE_NONE)
- h->old_params.vi_bitrate.target = params->vi_bitrate.target;
- if (params->vi_bitrate.mode == V4L2_BITRATE_VBR)
- h->old_params.vi_bitrate.max = params->vi_bitrate.max;
- if (params->au_bitrate.mode != V4L2_BITRATE_NONE)
- h->old_params.au_bitrate.target = params->au_bitrate.target;
-
- /* aspect ratio */
- if (params->vi_aspect_ratio == V4L2_MPEG_ASPECT_4_3 ||
- params->vi_aspect_ratio == V4L2_MPEG_ASPECT_16_9) {
- h->old_params.vi_aspect_ratio = params->vi_aspect_ratio;
- if (params->vi_aspect_ratio == V4L2_MPEG_ASPECT_4_3)
- h->params.vi_aspect = V4L2_MPEG_VIDEO_ASPECT_4x3;
- else
- h->params.vi_aspect = V4L2_MPEG_VIDEO_ASPECT_16x9;
- }
-
- /* range checks */
- if (h->old_params.st_bitrate.target > MPEG_TOTAL_TARGET_BITRATE_MAX)
- h->old_params.st_bitrate.target = MPEG_TOTAL_TARGET_BITRATE_MAX;
- if (h->old_params.vi_bitrate.target > MPEG_VIDEO_TARGET_BITRATE_MAX)
- h->old_params.vi_bitrate.target = MPEG_VIDEO_TARGET_BITRATE_MAX;
- if (h->old_params.vi_bitrate.max > MPEG_VIDEO_MAX_BITRATE_MAX)
- h->old_params.vi_bitrate.max = MPEG_VIDEO_MAX_BITRATE_MAX;
- h->params.vi_bitrate = params->vi_bitrate.target;
- h->params.vi_bitrate_peak = params->vi_bitrate.max;
- if (h->old_params.au_bitrate.target <= 256) {
- h->old_params.au_bitrate.target = 256;
- h->params.au_l2_bitrate = V4L2_MPEG_AUDIO_L2_BITRATE_256K;
- }
- else {
- h->old_params.au_bitrate.target = 384;
- h->params.au_l2_bitrate = V4L2_MPEG_AUDIO_L2_BITRATE_384K;
- }
-}
-
static int handle_ctrl(struct saa6752hs_mpeg_params *params,
struct v4l2_ext_control *ctrl, unsigned int cmd)
{
@@ -697,7 +599,6 @@ static int saa6752hs_attach(struct i2c_adapter *adap, int addr, int kind)
return -ENOMEM;
h->client = client_template;
h->params = param_defaults;
- h->old_params = old_param_defaults;
h->client.adapter = adap;
h->client.addr = addr;
@@ -734,23 +635,11 @@ saa6752hs_command(struct i2c_client *client, unsigned int cmd, void *arg)
{
struct saa6752hs_state *h = i2c_get_clientdata(client);
struct v4l2_ext_controls *ctrls = arg;
- struct v4l2_mpeg_compression *old_params = arg;
struct saa6752hs_mpeg_params params;
int err = 0;
int i;
switch (cmd) {
- case VIDIOC_S_MPEGCOMP:
- if (NULL == old_params) {
- /* apply settings and start encoder */
- saa6752hs_init(client);
- break;
- }
- saa6752hs_old_set_params(client, old_params);
- /* fall through */
- case VIDIOC_G_MPEGCOMP:
- *old_params = h->old_params;
- break;
case VIDIOC_S_EXT_CTRLS:
if (ctrls->ctrl_class != V4L2_CTRL_CLASS_MPEG)
return -EINVAL;
diff --git a/drivers/media/video/saa7134/saa7134-core.c b/drivers/media/video/saa7134/saa7134-core.c
index 1a4a24471f2..a499eea379e 100644
--- a/drivers/media/video/saa7134/saa7134-core.c
+++ b/drivers/media/video/saa7134/saa7134-core.c
@@ -429,7 +429,7 @@ int saa7134_set_dmabits(struct saa7134_dev *dev)
assert_spin_locked(&dev->slock);
- if (dev->inresume)
+ if (dev->insuspend)
return 0;
/* video capture -- dma 0 + video task A */
@@ -563,6 +563,9 @@ static irqreturn_t saa7134_irq(int irq, void *dev_id)
unsigned long report,status;
int loop, handled = 0;
+ if (dev->insuspend)
+ goto out;
+
for (loop = 0; loop < 10; loop++) {
report = saa_readl(SAA7134_IRQ_REPORT);
status = saa_readl(SAA7134_IRQ_STATUS);
@@ -1163,6 +1166,7 @@ static void __devexit saa7134_finidev(struct pci_dev *pci_dev)
kfree(dev);
}
+#ifdef CONFIG_PM
static int saa7134_suspend(struct pci_dev *pci_dev , pm_message_t state)
{
@@ -1176,6 +1180,19 @@ static int saa7134_suspend(struct pci_dev *pci_dev , pm_message_t state)
saa_writel(SAA7134_IRQ2, 0);
saa_writel(SAA7134_MAIN_CTRL, 0);
+ synchronize_irq(pci_dev->irq);
+ dev->insuspend = 1;
+
+	/* Disable timeout timers - if we have active buffers, we will
+	   fill them on resume */
+
+ del_timer(&dev->video_q.timeout);
+ del_timer(&dev->vbi_q.timeout);
+ del_timer(&dev->ts_q.timeout);
+
+ if (dev->remote)
+ saa7134_ir_stop(dev);
+
pci_set_power_state(pci_dev, pci_choose_state(pci_dev, state));
pci_save_state(pci_dev);
@@ -1194,24 +1211,27 @@ static int saa7134_resume(struct pci_dev *pci_dev)
	/* Do things that are done in saa7134_initdev,
	   except for initializing memory structures. */
- dev->inresume = 1;
saa7134_board_init1(dev);
+ /* saa7134_hwinit1 */
if (saa7134_boards[dev->board].video_out)
saa7134_videoport_init(dev);
-
if (card_has_mpeg(dev))
saa7134_ts_init_hw(dev);
-
+ if (dev->remote)
+ saa7134_ir_start(dev, dev->remote);
saa7134_hw_enable1(dev);
- saa7134_set_decoder(dev);
- saa7134_i2c_call_clients(dev, VIDIOC_S_STD, &dev->tvnorm->id);
+
+
saa7134_board_init2(dev);
- saa7134_hw_enable2(dev);
+	/* saa7134_hwinit2 */
+ saa7134_set_tvnorm_hw(dev);
saa7134_tvaudio_setmute(dev);
saa7134_tvaudio_setvolume(dev, dev->ctl_volume);
+ saa7134_tvaudio_do_scan(dev);
saa7134_enable_i2s(dev);
+ saa7134_hw_enable2(dev);
/*resume unfinished buffer(s)*/
spin_lock_irqsave(&dev->slock, flags);
@@ -1219,13 +1239,19 @@ static int saa7134_resume(struct pci_dev *pci_dev)
saa7134_buffer_requeue(dev, &dev->vbi_q);
saa7134_buffer_requeue(dev, &dev->ts_q);
+	/* FIXME: Disable DMA audio sound - temporary, until proper
+	   support is implemented */
+
+ dev->dmasound.dma_running = 0;
+
/* start DMA now*/
- dev->inresume = 0;
+ dev->insuspend = 0;
saa7134_set_dmabits(dev);
spin_unlock_irqrestore(&dev->slock, flags);
return 0;
}
+#endif
/* ----------------------------------------------------------- */
@@ -1262,8 +1288,10 @@ static struct pci_driver saa7134_pci_driver = {
.id_table = saa7134_pci_tbl,
.probe = saa7134_initdev,
.remove = __devexit_p(saa7134_finidev),
+#ifdef CONFIG_PM
.suspend = saa7134_suspend,
.resume = saa7134_resume
+#endif
};
static int saa7134_init(void)
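Both new callbacks and their entries in the pci_driver are wrapped in CONFIG_PM, so kernels built without power management compile none of this code. The skeleton, reduced to its moving parts (driver names are placeholders):

#include <linux/pci.h>

#ifdef CONFIG_PM
static int my_suspend(struct pci_dev *pci_dev, pm_message_t state)
{
	/* quiesce the device: mask IRQs, stop timers, ... */
	pci_save_state(pci_dev);
	pci_set_power_state(pci_dev, pci_choose_state(pci_dev, state));
	return 0;
}

static int my_resume(struct pci_dev *pci_dev)
{
	pci_set_power_state(pci_dev, PCI_D0);
	pci_restore_state(pci_dev);
	/* reprogram the hardware, requeue buffers, ... */
	return 0;
}
#endif

static struct pci_driver my_pci_driver = {
	.name    = "my-driver",
	/* .id_table, .probe, .remove elided */
#ifdef CONFIG_PM
	.suspend = my_suspend,
	.resume  = my_resume,
#endif
};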
diff --git a/drivers/media/video/saa7134/saa7134-empress.c b/drivers/media/video/saa7134/saa7134-empress.c
index 34ca874dd7f..75d0c5bf46d 100644
--- a/drivers/media/video/saa7134/saa7134-empress.c
+++ b/drivers/media/video/saa7134/saa7134-empress.c
@@ -284,17 +284,6 @@ static int ts_do_ioctl(struct inode *inode, struct file *file,
case VIDIOC_S_CTRL:
return saa7134_common_ioctl(dev, cmd, arg);
- case VIDIOC_S_MPEGCOMP:
- printk(KERN_WARNING "VIDIOC_S_MPEGCOMP is obsolete. "
- "Replace with VIDIOC_S_EXT_CTRLS!");
- saa7134_i2c_call_clients(dev, VIDIOC_S_MPEGCOMP, arg);
- ts_init_encoder(dev);
- return 0;
- case VIDIOC_G_MPEGCOMP:
- printk(KERN_WARNING "VIDIOC_G_MPEGCOMP is obsolete. "
- "Replace with VIDIOC_G_EXT_CTRLS!");
- saa7134_i2c_call_clients(dev, VIDIOC_G_MPEGCOMP, arg);
- return 0;
case VIDIOC_S_EXT_CTRLS:
/* count == 0 is abused in saa6752hs.c, so that special
case is handled here explicitly. */
@@ -342,7 +331,6 @@ static struct video_device saa7134_empress_template =
.name = "saa7134-empress",
.type = 0 /* FIXME */,
.type2 = 0 /* FIXME */,
- .hardware = 0,
.fops = &ts_fops,
.minor = -1,
};
diff --git a/drivers/media/video/saa7134/saa7134-input.c b/drivers/media/video/saa7134/saa7134-input.c
index 80d2644f765..3abaa1b8ac9 100644
--- a/drivers/media/video/saa7134/saa7134-input.c
+++ b/drivers/media/video/saa7134/saa7134-input.c
@@ -44,6 +44,14 @@ module_param(ir_rc5_remote_gap, int, 0644);
static int ir_rc5_key_timeout = 115;
module_param(ir_rc5_key_timeout, int, 0644);
+static int repeat_delay = 500;
+module_param(repeat_delay, int, 0644);
+MODULE_PARM_DESC(repeat_delay, "delay before key repeat starts");
+static int repeat_period = 33;
+module_param(repeat_period, int, 0644);
+MODULE_PARM_DESC(repeat_period, "repeat period between "
+		 "keypresses when key is down");
+
#define dprintk(fmt, arg...) if (ir_debug) \
printk(KERN_DEBUG "%s/ir: " fmt, dev->name , ## arg)
#define i2cdprintk(fmt, arg...) if (ir_debug) \
@@ -59,6 +67,13 @@ static int build_key(struct saa7134_dev *dev)
struct card_ir *ir = dev->remote;
u32 gpio, data;
+ /* here comes the additional handshake steps for some cards */
+ switch (dev->board) {
+ case SAA7134_BOARD_GOTVIEW_7135:
+ saa_setb(SAA7134_GPIO_GPSTATUS1, 0x80);
+ saa_clearb(SAA7134_GPIO_GPSTATUS1, 0x80);
+ break;
+ }
/* rising SAA7134_GPIO_GPRESCAN reads the status */
saa_clearb(SAA7134_GPIO_GPMODE3,SAA7134_GPIO_GPRESCAN);
saa_setb(SAA7134_GPIO_GPMODE3,SAA7134_GPIO_GPRESCAN);
@@ -159,7 +174,7 @@ static void saa7134_input_timer(unsigned long data)
mod_timer(&ir->timer, jiffies + msecs_to_jiffies(ir->polling));
}
-static void saa7134_ir_start(struct saa7134_dev *dev, struct card_ir *ir)
+void saa7134_ir_start(struct saa7134_dev *dev, struct card_ir *ir)
{
if (ir->polling) {
setup_timer(&ir->timer, saa7134_input_timer,
@@ -182,7 +197,7 @@ static void saa7134_ir_start(struct saa7134_dev *dev, struct card_ir *ir)
}
}
-static void saa7134_ir_stop(struct saa7134_dev *dev)
+void saa7134_ir_stop(struct saa7134_dev *dev)
{
if (dev->remote->polling)
del_timer_sync(&dev->remote->timer);
@@ -285,10 +300,10 @@ int saa7134_input_init1(struct saa7134_dev *dev)
break;
case SAA7134_BOARD_GOTVIEW_7135:
ir_codes = ir_codes_gotview7135;
- mask_keycode = 0x0003EC;
- mask_keyup = 0x008000;
+ mask_keycode = 0x0003CC;
mask_keydown = 0x000010;
- polling = 50; // ms
+ polling = 5; /* ms */
+ saa_setb(SAA7134_GPIO_GPMODE1, 0x80);
break;
case SAA7134_BOARD_VIDEOMATE_TV_PVR:
case SAA7134_BOARD_VIDEOMATE_GOLD_PLUS:
@@ -386,6 +401,10 @@ int saa7134_input_init1(struct saa7134_dev *dev)
if (err)
goto err_out_stop;
+ /* the remote isn't as bouncy as a keyboard */
+ ir->dev->rep[REP_DELAY] = repeat_delay;
+ ir->dev->rep[REP_PERIOD] = repeat_period;
+
return 0;
err_out_stop:
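The rep[] assignments above override the input core's default autorepeat timing per device. A sketch of the knob, using the new module parameters' defaults (500 ms delay, 33 ms period, roughly 30 repeats a second):

#include <linux/input.h>

static void tune_ir_autorepeat(struct input_dev *dev)
{
	/* Values are in milliseconds; the input core applies them
	 * when EV_REP is set on the device. */
	dev->rep[REP_DELAY]  = 500;	/* before the first repeat */
	dev->rep[REP_PERIOD] = 33;	/* between repeats */
}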
diff --git a/drivers/media/video/saa7134/saa7134-tvaudio.c b/drivers/media/video/saa7134/saa7134-tvaudio.c
index 1b9e39a5ea4..f8e304c7623 100644
--- a/drivers/media/video/saa7134/saa7134-tvaudio.c
+++ b/drivers/media/video/saa7134/saa7134-tvaudio.c
@@ -27,6 +27,7 @@
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/delay.h>
+#include <linux/freezer.h>
#include <asm/div64.h>
#include "saa7134-reg.h"
@@ -231,7 +232,7 @@ static void mute_input_7134(struct saa7134_dev *dev)
}
if (dev->hw_mute == mute &&
- dev->hw_input == in && !dev->inresume) {
+ dev->hw_input == in && !dev->insuspend) {
dprintk("mute/input: nothing to do [mute=%d,input=%s]\n",
mute,in->name);
return;
@@ -502,13 +503,17 @@ static int tvaudio_thread(void *data)
unsigned int i, audio, nscan;
int max1,max2,carrier,rx,mode,lastmode,default_carrier;
- allow_signal(SIGTERM);
+
+ set_freezable();
+
for (;;) {
tvaudio_sleep(dev,-1);
- if (kthread_should_stop() || signal_pending(current))
+ if (kthread_should_stop())
goto done;
restart:
+ try_to_freeze();
+
dev->thread.scan1 = dev->thread.scan2;
dprintk("tvaudio thread scan start [%d]\n",dev->thread.scan1);
dev->tvaudio = NULL;
@@ -612,9 +617,12 @@ static int tvaudio_thread(void *data)
lastmode = 42;
for (;;) {
+
+ try_to_freeze();
+
if (tvaudio_sleep(dev,5000))
goto restart;
- if (kthread_should_stop() || signal_pending(current))
+ if (kthread_should_stop())
break;
if (UNSET == dev->thread.mode) {
rx = tvaudio_getstereo(dev,&tvaudio[i]);
@@ -630,6 +638,7 @@ static int tvaudio_thread(void *data)
}
done:
+ dev->thread.stopped = 1;
return 0;
}
@@ -777,7 +786,8 @@ static int tvaudio_thread_ddep(void *data)
struct saa7134_dev *dev = data;
u32 value, norms, clock;
- allow_signal(SIGTERM);
+
+ set_freezable();
clock = saa7134_boards[dev->board].audio_clock;
if (UNSET != audio_clock_override)
@@ -790,10 +800,13 @@ static int tvaudio_thread_ddep(void *data)
for (;;) {
tvaudio_sleep(dev,-1);
- if (kthread_should_stop() || signal_pending(current))
+ if (kthread_should_stop())
goto done;
restart:
+
+ try_to_freeze();
+
dev->thread.scan1 = dev->thread.scan2;
dprintk("tvaudio thread scan start [%d]\n",dev->thread.scan1);
@@ -870,6 +883,7 @@ static int tvaudio_thread_ddep(void *data)
}
done:
+ dev->thread.stopped = 1;
return 0;
}
@@ -997,7 +1011,7 @@ int saa7134_tvaudio_init2(struct saa7134_dev *dev)
int saa7134_tvaudio_fini(struct saa7134_dev *dev)
{
/* shutdown tvaudio thread */
- if (dev->thread.thread)
+ if (dev->thread.thread && !dev->thread.stopped)
kthread_stop(dev->thread.thread);
saa_andorb(SAA7134_ANALOG_IO_SELECT, 0x07, 0x00); /* LINE1 */
@@ -1013,7 +1027,9 @@ int saa7134_tvaudio_do_scan(struct saa7134_dev *dev)
} else if (dev->thread.thread) {
dev->thread.mode = UNSET;
dev->thread.scan2++;
- wake_up_process(dev->thread.thread);
+
+ if (!dev->insuspend && !dev->thread.stopped)
+ wake_up_process(dev->thread.thread);
} else {
dev->automute = 0;
saa7134_tvaudio_setmute(dev);
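The tvaudio threads drop the old allow_signal(SIGTERM)/signal_pending() scheme for the freezer API: set_freezable() opts the kthread into suspend freezing and try_to_freeze() parks it at safe points, with kthread_should_stop() as the only exit condition. The resulting loop shape, as a sketch:

#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/sched.h>

static int my_worker(void *data)
{
	set_freezable();	/* allow the suspend freezer to stop us */

	while (!kthread_should_stop()) {
		try_to_freeze();	/* parks here across suspend/resume */

		/* ... one unit of work ... */

		schedule_timeout_interruptible(HZ);
	}
	return 0;
}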
diff --git a/drivers/media/video/saa7134/saa7134-video.c b/drivers/media/video/saa7134/saa7134-video.c
index 471b92793c1..3b9ffb4b648 100644
--- a/drivers/media/video/saa7134/saa7134-video.c
+++ b/drivers/media/video/saa7134/saa7134-video.c
@@ -560,15 +560,8 @@ void set_tvnorm(struct saa7134_dev *dev, struct saa7134_tvnorm *norm)
dev->crop_current = dev->crop_defrect;
- saa7134_set_decoder(dev);
+ saa7134_set_tvnorm_hw(dev);
- if (card_in(dev, dev->ctl_input).tv) {
- if ((card(dev).tuner_type == TUNER_PHILIPS_TDA8290)
- && ((card(dev).tuner_config == 1)
- || (card(dev).tuner_config == 2)))
- saa7134_set_gpio(dev, 22, 5);
- saa7134_i2c_call_clients(dev, VIDIOC_S_STD, &norm->id);
- }
}
static void video_mux(struct saa7134_dev *dev, int input)
@@ -579,7 +572,8 @@ static void video_mux(struct saa7134_dev *dev, int input)
saa7134_tvaudio_setinput(dev, &card_in(dev, input));
}
-void saa7134_set_decoder(struct saa7134_dev *dev)
+
+static void saa7134_set_decoder(struct saa7134_dev *dev)
{
int luma_control, sync_control, mux;
@@ -630,6 +624,19 @@ void saa7134_set_decoder(struct saa7134_dev *dev)
saa_writeb(SAA7134_RAW_DATA_OFFSET, 0x80);
}
+void saa7134_set_tvnorm_hw(struct saa7134_dev *dev)
+{
+ saa7134_set_decoder(dev);
+
+ if (card_in(dev, dev->ctl_input).tv) {
+ if ((card(dev).tuner_type == TUNER_PHILIPS_TDA8290)
+ && ((card(dev).tuner_config == 1)
+ || (card(dev).tuner_config == 2)))
+ saa7134_set_gpio(dev, 22, 5);
+ saa7134_i2c_call_clients(dev, VIDIOC_S_STD, &dev->tvnorm->id);
+ }
+}
+
static void set_h_prescale(struct saa7134_dev *dev, int task, int prescale)
{
static const struct {
@@ -2352,7 +2359,6 @@ struct video_device saa7134_video_template =
.name = "saa7134-video",
.type = VID_TYPE_CAPTURE|VID_TYPE_TUNER|
VID_TYPE_CLIPPING|VID_TYPE_SCALES,
- .hardware = 0,
.fops = &video_fops,
.minor = -1,
};
@@ -2361,7 +2367,6 @@ struct video_device saa7134_vbi_template =
{
.name = "saa7134-vbi",
.type = VID_TYPE_TUNER|VID_TYPE_TELETEXT,
- .hardware = 0,
.fops = &video_fops,
.minor = -1,
};
@@ -2370,7 +2375,6 @@ struct video_device saa7134_radio_template =
{
.name = "saa7134-radio",
.type = VID_TYPE_TUNER,
- .hardware = 0,
.fops = &radio_fops,
.minor = -1,
};
diff --git a/drivers/media/video/saa7134/saa7134.h b/drivers/media/video/saa7134/saa7134.h
index 28ec6804bd5..66a390c321a 100644
--- a/drivers/media/video/saa7134/saa7134.h
+++ b/drivers/media/video/saa7134/saa7134.h
@@ -333,6 +333,7 @@ struct saa7134_thread {
unsigned int scan1;
unsigned int scan2;
unsigned int mode;
+ unsigned int stopped;
};
/* buffer for one video/vbi/ts frame */
@@ -524,7 +525,7 @@ struct saa7134_dev {
unsigned int hw_mute;
int last_carrier;
int nosignal;
- unsigned int inresume;
+ unsigned int insuspend;
/* SAA7134_MPEG_* */
struct saa7134_ts ts;
@@ -632,7 +633,7 @@ extern struct video_device saa7134_radio_template;
void set_tvnorm(struct saa7134_dev *dev, struct saa7134_tvnorm *norm);
int saa7134_videoport_init(struct saa7134_dev *dev);
-void saa7134_set_decoder(struct saa7134_dev *dev);
+void saa7134_set_tvnorm_hw(struct saa7134_dev *dev);
int saa7134_common_ioctl(struct saa7134_dev *dev,
unsigned int cmd, void *arg);
@@ -706,6 +707,8 @@ int saa7134_input_init1(struct saa7134_dev *dev);
void saa7134_input_fini(struct saa7134_dev *dev);
void saa7134_input_irq(struct saa7134_dev *dev);
void saa7134_set_i2c_ir(struct saa7134_dev *dev, struct IR_i2c *ir);
+void saa7134_ir_start(struct saa7134_dev *dev, struct card_ir *ir);
+void saa7134_ir_stop(struct saa7134_dev *dev);
/*
diff --git a/drivers/media/video/se401.c b/drivers/media/video/se401.c
index 93fb04ed99a..d5d7d6cf734 100644
--- a/drivers/media/video/se401.c
+++ b/drivers/media/video/se401.c
@@ -1231,7 +1231,6 @@ static struct video_device se401_template = {
.owner = THIS_MODULE,
.name = "se401 USB camera",
.type = VID_TYPE_CAPTURE,
- .hardware = VID_HARDWARE_SE401,
.fops = &se401_fops,
};
diff --git a/drivers/media/video/sn9c102/sn9c102_core.c b/drivers/media/video/sn9c102/sn9c102_core.c
index 6991e06f765..511847912c4 100644
--- a/drivers/media/video/sn9c102/sn9c102_core.c
+++ b/drivers/media/video/sn9c102/sn9c102_core.c
@@ -3319,7 +3319,6 @@ sn9c102_usb_probe(struct usb_interface* intf, const struct usb_device_id* id)
strcpy(cam->v4ldev->name, "SN9C1xx PC Camera");
cam->v4ldev->owner = THIS_MODULE;
cam->v4ldev->type = VID_TYPE_CAPTURE | VID_TYPE_SCALES;
- cam->v4ldev->hardware = 0;
cam->v4ldev->fops = &sn9c102_fops;
cam->v4ldev->minor = video_nr[dev_nr];
cam->v4ldev->release = video_device_release;
diff --git a/drivers/media/video/stradis.c b/drivers/media/video/stradis.c
index eb220461ac7..3fb85af5d1f 100644
--- a/drivers/media/video/stradis.c
+++ b/drivers/media/video/stradis.c
@@ -1917,7 +1917,6 @@ static const struct file_operations saa_fops = {
static struct video_device saa_template = {
.name = "SAA7146A",
.type = VID_TYPE_CAPTURE | VID_TYPE_OVERLAY,
- .hardware = VID_HARDWARE_SAA7146,
.fops = &saa_fops,
.minor = -1,
};
diff --git a/drivers/media/video/stv680.c b/drivers/media/video/stv680.c
index 9e009a7ab86..afc32aa56fd 100644
--- a/drivers/media/video/stv680.c
+++ b/drivers/media/video/stv680.c
@@ -1398,7 +1398,6 @@ static struct video_device stv680_template = {
.owner = THIS_MODULE,
.name = "STV0680 USB camera",
.type = VID_TYPE_CAPTURE,
- .hardware = VID_HARDWARE_SE401,
.fops = &stv680_fops,
.release = video_device_release,
.minor = -1,
diff --git a/drivers/media/video/tuner-core.c b/drivers/media/video/tuner-core.c
index 94843086cda..6a777604f07 100644
--- a/drivers/media/video/tuner-core.c
+++ b/drivers/media/video/tuner-core.c
@@ -113,7 +113,7 @@ static void fe_standby(struct tuner *t)
static int fe_has_signal(struct tuner *t)
{
struct dvb_tuner_ops *fe_tuner_ops = &t->fe.ops.tuner_ops;
- u16 strength;
+ u16 strength = 0;
if (fe_tuner_ops->get_rf_strength)
fe_tuner_ops->get_rf_strength(&t->fe, &strength);
diff --git a/drivers/media/video/usbvideo/usbvideo.c b/drivers/media/video/usbvideo/usbvideo.c
index 37ce36b9e58..fb434b5602a 100644
--- a/drivers/media/video/usbvideo/usbvideo.c
+++ b/drivers/media/video/usbvideo/usbvideo.c
@@ -952,7 +952,6 @@ static const struct file_operations usbvideo_fops = {
static const struct video_device usbvideo_template = {
.owner = THIS_MODULE,
.type = VID_TYPE_CAPTURE,
- .hardware = VID_HARDWARE_CPIA,
.fops = &usbvideo_fops,
};
diff --git a/drivers/media/video/usbvideo/vicam.c b/drivers/media/video/usbvideo/vicam.c
index db3c9e3deb2..da1ba021110 100644
--- a/drivers/media/video/usbvideo/vicam.c
+++ b/drivers/media/video/usbvideo/vicam.c
@@ -1074,7 +1074,6 @@ static struct video_device vicam_template = {
.owner = THIS_MODULE,
.name = "ViCam-based USB Camera",
.type = VID_TYPE_CAPTURE,
- .hardware = VID_HARDWARE_VICAM,
.fops = &vicam_fops,
.minor = -1,
};
diff --git a/drivers/media/video/usbvision/usbvision-video.c b/drivers/media/video/usbvision/usbvision-video.c
index e2f3c01cfa1..36e689fa16c 100644
--- a/drivers/media/video/usbvision/usbvision-video.c
+++ b/drivers/media/video/usbvision/usbvision-video.c
@@ -1400,7 +1400,6 @@ static const struct file_operations usbvision_fops = {
static struct video_device usbvision_video_template = {
.owner = THIS_MODULE,
.type = VID_TYPE_TUNER | VID_TYPE_CAPTURE,
- .hardware = VID_HARDWARE_USBVISION,
.fops = &usbvision_fops,
.name = "usbvision-video",
.release = video_device_release,
@@ -1455,7 +1454,6 @@ static struct video_device usbvision_radio_template=
{
.owner = THIS_MODULE,
.type = VID_TYPE_TUNER,
- .hardware = VID_HARDWARE_USBVISION,
.fops = &usbvision_radio_fops,
.name = "usbvision-radio",
.release = video_device_release,
@@ -1492,7 +1490,6 @@ static struct video_device usbvision_vbi_template=
{
.owner = THIS_MODULE,
.type = VID_TYPE_TUNER,
- .hardware = VID_HARDWARE_USBVISION,
.fops = &usbvision_vbi_fops,
.release = video_device_release,
.name = "usbvision-vbi",
diff --git a/drivers/media/video/v4l2-common.c b/drivers/media/video/v4l2-common.c
index 321249240d0..1141b4bf41c 100644
--- a/drivers/media/video/v4l2-common.c
+++ b/drivers/media/video/v4l2-common.c
@@ -317,8 +317,6 @@ static const char *v4l2_ioctls[] = {
[_IOC_NR(VIDIOC_ENUM_FMT)] = "VIDIOC_ENUM_FMT",
[_IOC_NR(VIDIOC_G_FMT)] = "VIDIOC_G_FMT",
[_IOC_NR(VIDIOC_S_FMT)] = "VIDIOC_S_FMT",
- [_IOC_NR(VIDIOC_G_MPEGCOMP)] = "VIDIOC_G_MPEGCOMP",
- [_IOC_NR(VIDIOC_S_MPEGCOMP)] = "VIDIOC_S_MPEGCOMP",
[_IOC_NR(VIDIOC_REQBUFS)] = "VIDIOC_REQBUFS",
[_IOC_NR(VIDIOC_QUERYBUF)] = "VIDIOC_QUERYBUF",
[_IOC_NR(VIDIOC_G_FBUF)] = "VIDIOC_G_FBUF",
diff --git a/drivers/media/video/videobuf-core.c b/drivers/media/video/videobuf-core.c
index 5599a36490f..89a44f16f0b 100644
--- a/drivers/media/video/videobuf-core.c
+++ b/drivers/media/video/videobuf-core.c
@@ -967,6 +967,7 @@ int videobuf_cgmbuf(struct videobuf_queue *q,
return 0;
}
+EXPORT_SYMBOL_GPL(videobuf_cgmbuf);
#endif
/* --------------------------------------------------------------------- */
@@ -985,7 +986,6 @@ EXPORT_SYMBOL_GPL(videobuf_reqbufs);
EXPORT_SYMBOL_GPL(videobuf_querybuf);
EXPORT_SYMBOL_GPL(videobuf_qbuf);
EXPORT_SYMBOL_GPL(videobuf_dqbuf);
-EXPORT_SYMBOL_GPL(videobuf_cgmbuf);
EXPORT_SYMBOL_GPL(videobuf_streamon);
EXPORT_SYMBOL_GPL(videobuf_streamoff);
diff --git a/drivers/media/video/videobuf-dma-sg.c b/drivers/media/video/videobuf-dma-sg.c
index 3eb6123227b..9ab94a749d8 100644
--- a/drivers/media/video/videobuf-dma-sg.c
+++ b/drivers/media/video/videobuf-dma-sg.c
@@ -27,6 +27,7 @@
#include <linux/pci.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
+#include <linux/scatterlist.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -60,12 +61,13 @@ videobuf_vmalloc_to_sg(unsigned char *virt, int nr_pages)
sglist = kcalloc(nr_pages, sizeof(struct scatterlist), GFP_KERNEL);
if (NULL == sglist)
return NULL;
+ sg_init_table(sglist, nr_pages);
for (i = 0; i < nr_pages; i++, virt += PAGE_SIZE) {
pg = vmalloc_to_page(virt);
if (NULL == pg)
goto err;
BUG_ON(PageHighMem(pg));
- sglist[i].page = pg;
+ sg_set_page(&sglist[i], pg);
sglist[i].length = PAGE_SIZE;
}
return sglist;
@@ -86,13 +88,14 @@ videobuf_pages_to_sg(struct page **pages, int nr_pages, int offset)
sglist = kcalloc(nr_pages, sizeof(*sglist), GFP_KERNEL);
if (NULL == sglist)
return NULL;
+ sg_init_table(sglist, nr_pages);
if (NULL == pages[0])
goto nopage;
if (PageHighMem(pages[0]))
/* DMA to highmem pages might not work */
goto highmem;
- sglist[0].page = pages[0];
+ sg_set_page(&sglist[0], pages[0]);
sglist[0].offset = offset;
sglist[0].length = PAGE_SIZE - offset;
for (i = 1; i < nr_pages; i++) {
@@ -100,7 +103,7 @@ videobuf_pages_to_sg(struct page **pages, int nr_pages, int offset)
goto nopage;
if (PageHighMem(pages[i]))
goto highmem;
- sglist[i].page = pages[i];
+ sg_set_page(&sglist[i], pages[i]);
sglist[i].length = PAGE_SIZE;
}
return sglist;
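All the sg conversions in this series follow the same mapping: initialize fresh lists with sg_init_table(), stop writing sg->page directly in favour of sg_set_page()/sg_page(), and replace the open-coded page_address(sg->page) + sg->offset with sg_virt(). Conceptually the accessors stand for the following (a sketch, not the real helper implementations, which also carry debug checks and chain-aware bit handling):

#include <linux/scatterlist.h>
#include <linux/mm.h>

static inline void sketch_sg_set_page(struct scatterlist *sg,
				      struct page *page)
{
	sg->page = page;			/* old open-coded form */
}

static inline struct page *sketch_sg_page(struct scatterlist *sg)
{
	return sg->page;
}

static inline void *sketch_sg_virt(struct scatterlist *sg)
{
	return page_address(sketch_sg_page(sg)) + sg->offset;
}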
diff --git a/drivers/media/video/videocodec.c b/drivers/media/video/videocodec.c
index f2bbd7a4d56..87951ec8254 100644
--- a/drivers/media/video/videocodec.c
+++ b/drivers/media/video/videocodec.c
@@ -86,8 +86,8 @@ videocodec_attach (struct videocodec_master *master)
}
dprintk(2,
- "videocodec_attach: '%s', type: %x, flags %lx, magic %lx\n",
- master->name, master->type, master->flags, master->magic);
+ "videocodec_attach: '%s', flags %lx, magic %lx\n",
+ master->name, master->flags, master->magic);
if (!h) {
dprintk(1,
diff --git a/drivers/media/video/videodev.c b/drivers/media/video/videodev.c
index 8d8e517b344..9611c399028 100644
--- a/drivers/media/video/videodev.c
+++ b/drivers/media/video/videodev.c
@@ -1313,48 +1313,6 @@ static int __video_do_ioctl(struct inode *inode, struct file *file,
ret=vfd->vidioc_cropcap(file, fh, p);
break;
}
- case VIDIOC_G_MPEGCOMP:
- {
- struct v4l2_mpeg_compression *p=arg;
-
- /*FIXME: Several fields not shown */
- if (!vfd->vidioc_g_mpegcomp)
- break;
- ret=vfd->vidioc_g_mpegcomp(file, fh, p);
- if (!ret)
- dbgarg (cmd, "ts_pid_pmt=%d, ts_pid_audio=%d,"
- " ts_pid_video=%d, ts_pid_pcr=%d, "
- "ps_size=%d, au_sample_rate=%d, "
- "au_pesid=%c, vi_frame_rate=%d, "
- "vi_frames_per_gop=%d, "
- "vi_bframes_count=%d, vi_pesid=%c\n",
- p->ts_pid_pmt,p->ts_pid_audio,
- p->ts_pid_video,p->ts_pid_pcr,
- p->ps_size, p->au_sample_rate,
- p->au_pesid, p->vi_frame_rate,
- p->vi_frames_per_gop,
- p->vi_bframes_count, p->vi_pesid);
- break;
- }
- case VIDIOC_S_MPEGCOMP:
- {
- struct v4l2_mpeg_compression *p=arg;
- /*FIXME: Several fields not shown */
- if (!vfd->vidioc_s_mpegcomp)
- break;
- dbgarg (cmd, "ts_pid_pmt=%d, ts_pid_audio=%d, "
- "ts_pid_video=%d, ts_pid_pcr=%d, ps_size=%d, "
- "au_sample_rate=%d, au_pesid=%c, "
- "vi_frame_rate=%d, vi_frames_per_gop=%d, "
- "vi_bframes_count=%d, vi_pesid=%c\n",
- p->ts_pid_pmt,p->ts_pid_audio, p->ts_pid_video,
- p->ts_pid_pcr, p->ps_size, p->au_sample_rate,
- p->au_pesid, p->vi_frame_rate,
- p->vi_frames_per_gop, p->vi_bframes_count,
- p->vi_pesid);
- ret=vfd->vidioc_s_mpegcomp(file, fh, p);
- break;
- }
case VIDIOC_G_JPEGCOMP:
{
struct v4l2_jpegcompression *p=arg;
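With VIDIOC_{G,S}_MPEGCOMP gone, applications set MPEG encoding parameters through the extended-controls interface instead. A minimal userspace sketch of the replacement call (the control ID is standard V4L2; the bitrate value is arbitrary):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/videodev2.h>

int set_bitrate(int fd, int bps)
{
	struct v4l2_ext_control ctrl;
	struct v4l2_ext_controls ctrls;

	memset(&ctrl, 0, sizeof(ctrl));
	ctrl.id = V4L2_CID_MPEG_VIDEO_BITRATE;
	ctrl.value = bps;

	memset(&ctrls, 0, sizeof(ctrls));
	ctrls.ctrl_class = V4L2_CTRL_CLASS_MPEG;
	ctrls.count = 1;
	ctrls.controls = &ctrl;

	return ioctl(fd, VIDIOC_S_EXT_CTRLS, &ctrls);
}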
diff --git a/drivers/media/video/vivi.c b/drivers/media/video/vivi.c
index b532aa280a1..ee73dc75131 100644
--- a/drivers/media/video/vivi.c
+++ b/drivers/media/video/vivi.c
@@ -1119,7 +1119,6 @@ static const struct file_operations vivi_fops = {
static struct video_device vivi = {
.name = "vivi",
.type = VID_TYPE_CAPTURE,
- .hardware = 0,
.fops = &vivi_fops,
.minor = -1,
// .release = video_device_release,
diff --git a/drivers/media/video/w9966.c b/drivers/media/video/w9966.c
index 47366408637..08aaae07c7e 100644
--- a/drivers/media/video/w9966.c
+++ b/drivers/media/video/w9966.c
@@ -196,7 +196,6 @@ static struct video_device w9966_template = {
.owner = THIS_MODULE,
.name = W9966_DRIVERNAME,
.type = VID_TYPE_CAPTURE | VID_TYPE_SCALES,
- .hardware = VID_HARDWARE_W9966,
.fops = &w9966_fops,
};
diff --git a/drivers/media/video/w9968cf.c b/drivers/media/video/w9968cf.c
index 9e7f3e685d7..2ae1430f5f7 100644
--- a/drivers/media/video/w9968cf.c
+++ b/drivers/media/video/w9968cf.c
@@ -3549,7 +3549,6 @@ w9968cf_usb_probe(struct usb_interface* intf, const struct usb_device_id* id)
strcpy(cam->v4ldev->name, symbolic(camlist, mod_id));
cam->v4ldev->owner = THIS_MODULE;
cam->v4ldev->type = VID_TYPE_CAPTURE | VID_TYPE_SCALES;
- cam->v4ldev->hardware = VID_HARDWARE_W9968CF;
cam->v4ldev->fops = &w9968cf_fops;
cam->v4ldev->minor = video_nr[dev_nr];
cam->v4ldev->release = video_device_release;
diff --git a/drivers/media/video/zc0301/zc0301_core.c b/drivers/media/video/zc0301/zc0301_core.c
index 08a93c31c0a..2c5665c8244 100644
--- a/drivers/media/video/zc0301/zc0301_core.c
+++ b/drivers/media/video/zc0301/zc0301_core.c
@@ -1985,7 +1985,6 @@ zc0301_usb_probe(struct usb_interface* intf, const struct usb_device_id* id)
strcpy(cam->v4ldev->name, "ZC0301[P] PC Camera");
cam->v4ldev->owner = THIS_MODULE;
cam->v4ldev->type = VID_TYPE_CAPTURE | VID_TYPE_SCALES;
- cam->v4ldev->hardware = 0;
cam->v4ldev->fops = &zc0301_fops;
cam->v4ldev->minor = video_nr[dev_nr];
cam->v4ldev->release = video_device_release;
diff --git a/drivers/media/video/zoran_card.c b/drivers/media/video/zoran_card.c
index 48da36a15fc..6e0ac4c5c37 100644
--- a/drivers/media/video/zoran_card.c
+++ b/drivers/media/video/zoran_card.c
@@ -1235,8 +1235,14 @@ zoran_setup_videocodec (struct zoran *zr,
return m;
}
- m->magic = 0L; /* magic not used */
- m->type = VID_HARDWARE_ZR36067;
+	/* magic and type are unused for the master struct; they make
+	   sense only for codec structs.  In the past, .type was
+	   initialized to the old V4L1 .hardware value,
+	   VID_HARDWARE_ZR36067. */
+ m->magic = 0L;
+ m->type = 0;
+
m->flags = CODEC_FLAG_ENCODER | CODEC_FLAG_DECODER;
strncpy(m->name, ZR_DEVNAME(zr), sizeof(m->name));
m->data = zr;
diff --git a/drivers/media/video/zoran_driver.c b/drivers/media/video/zoran_driver.c
index 419e5af7853..dd3d7d2c8b0 100644
--- a/drivers/media/video/zoran_driver.c
+++ b/drivers/media/video/zoran_driver.c
@@ -60,7 +60,6 @@
#include <linux/spinlock.h>
#define MAP_NR(x) virt_to_page(x)
-#define ZORAN_HARDWARE VID_HARDWARE_ZR36067
#define ZORAN_VID_TYPE ( \
VID_TYPE_CAPTURE | \
VID_TYPE_OVERLAY | \
@@ -4659,7 +4658,6 @@ struct video_device zoran_template __devinitdata = {
#ifdef CONFIG_VIDEO_V4L2
.type2 = ZORAN_V4L2_VID_FLAGS,
#endif
- .hardware = ZORAN_HARDWARE,
.fops = &zoran_fops,
.release = &zoran_vdev_release,
.minor = -1
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index a5d0354bbbd..9203a0b221b 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -13,6 +13,7 @@
#include <linux/blkdev.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
+#include <linux/scatterlist.h>
#include <linux/mmc/card.h>
#include <linux/mmc/host.h>
@@ -153,19 +154,21 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, spinlock_t *lock
blk_queue_max_hw_segments(mq->queue, bouncesz / 512);
blk_queue_max_segment_size(mq->queue, bouncesz);
- mq->sg = kzalloc(sizeof(struct scatterlist),
+ mq->sg = kmalloc(sizeof(struct scatterlist),
GFP_KERNEL);
if (!mq->sg) {
ret = -ENOMEM;
goto cleanup_queue;
}
+ sg_init_table(mq->sg, 1);
- mq->bounce_sg = kzalloc(sizeof(struct scatterlist) *
+ mq->bounce_sg = kmalloc(sizeof(struct scatterlist) *
bouncesz / 512, GFP_KERNEL);
if (!mq->bounce_sg) {
ret = -ENOMEM;
goto cleanup_queue;
}
+ sg_init_table(mq->bounce_sg, bouncesz / 512);
}
}
#endif
@@ -302,12 +305,12 @@ static void copy_sg(struct scatterlist *dst, unsigned int dst_len,
BUG_ON(dst_len == 0);
if (dst_size == 0) {
- dst_buf = page_address(dst->page) + dst->offset;
+ dst_buf = sg_virt(dst);
dst_size = dst->length;
}
if (src_size == 0) {
- src_buf = page_address(src->page) + src->offset;
+		src_buf = sg_virt(src);
src_size = src->length;
}
@@ -353,9 +356,7 @@ unsigned int mmc_queue_map_sg(struct mmc_queue *mq)
return 1;
}
- mq->sg[0].page = virt_to_page(mq->bounce_buf);
- mq->sg[0].offset = offset_in_page(mq->bounce_buf);
- mq->sg[0].length = 0;
+ sg_init_one(mq->sg, mq->bounce_buf, 0);
while (sg_len) {
mq->sg[0].length += mq->bounce_sg[sg_len - 1].length;
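The bounce-buffer path above now describes the whole buffer with a single sg entry: sg_init_one() fills in page, offset and a zero length, and the loop then grows the length to the exact size of the request. As a sketch:

#include <linux/scatterlist.h>

static unsigned int map_bounce_sg(struct scatterlist *sg, void *bounce_buf,
				  struct scatterlist *orig, unsigned int n)
{
	unsigned int i;

	sg_init_one(sg, bounce_buf, 0);		/* page + offset set, length 0 */
	for (i = 0; i < n; i++)
		sg->length += orig[i].length;	/* total bytes to transfer */

	return 1;				/* one segment after bouncing */
}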
diff --git a/drivers/mmc/host/at91_mci.c b/drivers/mmc/host/at91_mci.c
index 7a452c2ad1f..b1edcefdd4f 100644
--- a/drivers/mmc/host/at91_mci.c
+++ b/drivers/mmc/host/at91_mci.c
@@ -149,7 +149,7 @@ static inline void at91_mci_sg_to_dma(struct at91mci_host *host, struct mmc_data
sg = &data->sg[i];
- sgbuffer = kmap_atomic(sg->page, KM_BIO_SRC_IRQ) + sg->offset;
+ sgbuffer = kmap_atomic(sg_page(sg), KM_BIO_SRC_IRQ) + sg->offset;
amount = min(size, sg->length);
size -= amount;
@@ -226,7 +226,7 @@ static void at91_mci_pre_dma_read(struct at91mci_host *host)
sg = &data->sg[host->transfer_index++];
pr_debug("sg = %p\n", sg);
- sg->dma_address = dma_map_page(NULL, sg->page, sg->offset, sg->length, DMA_FROM_DEVICE);
+ sg->dma_address = dma_map_page(NULL, sg_page(sg), sg->offset, sg->length, DMA_FROM_DEVICE);
pr_debug("dma address = %08X, length = %d\n", sg->dma_address, sg->length);
@@ -283,7 +283,7 @@ static void at91_mci_post_dma_read(struct at91mci_host *host)
int index;
/* Swap the contents of the buffer */
- buffer = kmap_atomic(sg->page, KM_BIO_SRC_IRQ) + sg->offset;
+ buffer = kmap_atomic(sg_page(sg), KM_BIO_SRC_IRQ) + sg->offset;
pr_debug("buffer = %p, length = %d\n", buffer, sg->length);
for (index = 0; index < (sg->length / 4); index++)
@@ -292,7 +292,7 @@ static void at91_mci_post_dma_read(struct at91mci_host *host)
kunmap_atomic(buffer, KM_BIO_SRC_IRQ);
}
- flush_dcache_page(sg->page);
+ flush_dcache_page(sg_page(sg));
}
/* Is there another transfer to trigger? */
diff --git a/drivers/mmc/host/au1xmmc.c b/drivers/mmc/host/au1xmmc.c
index 92c4d0dfee4..bcbb6d247bf 100644
--- a/drivers/mmc/host/au1xmmc.c
+++ b/drivers/mmc/host/au1xmmc.c
@@ -340,7 +340,7 @@ static void au1xmmc_send_pio(struct au1xmmc_host *host)
/* This is the pointer to the data buffer */
sg = &data->sg[host->pio.index];
- sg_ptr = page_address(sg->page) + sg->offset + host->pio.offset;
+ sg_ptr = sg_virt(sg) + host->pio.offset;
/* This is the space left inside the buffer */
sg_len = data->sg[host->pio.index].length - host->pio.offset;
@@ -400,7 +400,7 @@ static void au1xmmc_receive_pio(struct au1xmmc_host *host)
if (host->pio.index < host->dma.len) {
sg = &data->sg[host->pio.index];
- sg_ptr = page_address(sg->page) + sg->offset + host->pio.offset;
+ sg_ptr = sg_virt(sg) + host->pio.offset;
/* This is the space left inside the buffer */
sg_len = sg_dma_len(&data->sg[host->pio.index]) - host->pio.offset;
@@ -613,14 +613,11 @@ au1xmmc_prepare_data(struct au1xmmc_host *host, struct mmc_data *data)
if (host->flags & HOST_F_XMIT){
ret = au1xxx_dbdma_put_source_flags(channel,
- (void *) (page_address(sg->page) +
- sg->offset),
- len, flags);
+ (void *) sg_virt(sg), len, flags);
}
else {
ret = au1xxx_dbdma_put_dest_flags(channel,
- (void *) (page_address(sg->page) +
- sg->offset),
+ (void *) sg_virt(sg),
len, flags);
}
diff --git a/drivers/mmc/host/imxmmc.c b/drivers/mmc/host/imxmmc.c
index 6ebc41e7592..fc72e1fadb6 100644
--- a/drivers/mmc/host/imxmmc.c
+++ b/drivers/mmc/host/imxmmc.c
@@ -262,7 +262,7 @@ static void imxmci_setup_data(struct imxmci_host *host, struct mmc_data *data)
}
/* Convert back to virtual address */
- host->data_ptr = (u16*)(page_address(data->sg->page) + data->sg->offset);
+	host->data_ptr = (u16*)sg_virt(data->sg);
host->data_cnt = 0;
clear_bit(IMXMCI_PEND_DMA_DATA_b, &host->pending_events);
diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c
index 7ae18eaed6c..12c2d807c14 100644
--- a/drivers/mmc/host/mmc_spi.c
+++ b/drivers/mmc/host/mmc_spi.c
@@ -813,7 +813,7 @@ mmc_spi_data_do(struct mmc_spi_host *host, struct mmc_command *cmd,
&& dir == DMA_FROM_DEVICE)
dir = DMA_BIDIRECTIONAL;
- dma_addr = dma_map_page(dma_dev, sg->page, 0,
+ dma_addr = dma_map_page(dma_dev, sg_page(sg), 0,
PAGE_SIZE, dir);
if (direction == DMA_TO_DEVICE)
t->tx_dma = dma_addr + sg->offset;
@@ -822,7 +822,7 @@ mmc_spi_data_do(struct mmc_spi_host *host, struct mmc_command *cmd,
}
/* allow pio too; we don't allow highmem */
- kmap_addr = kmap(sg->page);
+ kmap_addr = kmap(sg_page(sg));
if (direction == DMA_TO_DEVICE)
t->tx_buf = kmap_addr + sg->offset;
else
@@ -855,8 +855,8 @@ mmc_spi_data_do(struct mmc_spi_host *host, struct mmc_command *cmd,
/* discard mappings */
if (direction == DMA_FROM_DEVICE)
- flush_kernel_dcache_page(sg->page);
- kunmap(sg->page);
+ flush_kernel_dcache_page(sg_page(sg));
+ kunmap(sg_page(sg));
if (dma_dev)
dma_unmap_page(dma_dev, dma_addr, PAGE_SIZE, dir);
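mmc_spi keeps two ways of reaching each sg segment, and both now go through sg_page(): dma_map_page() when the SPI master is DMA-capable, kmap() for the PIO fallback. Both paths side by side, as a sketch:

#include <linux/dma-mapping.h>
#include <linux/highmem.h>
#include <linux/scatterlist.h>

static void touch_segment(struct device *dma_dev, struct scatterlist *sg)
{
	/* DMA path: map the backing page, then offset into it. */
	dma_addr_t dma = dma_map_page(dma_dev, sg_page(sg), 0,
				      PAGE_SIZE, DMA_TO_DEVICE);

	/* PIO path: temporary kernel mapping of the same page. */
	void *va = kmap(sg_page(sg));

	/* ... use dma + sg->offset, or va + sg->offset ... */

	kunmap(sg_page(sg));
	dma_unmap_page(dma_dev, dma, PAGE_SIZE, DMA_TO_DEVICE);
}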
diff --git a/drivers/mmc/host/mmci.h b/drivers/mmc/host/mmci.h
index 000e6a91978..0f39c490f02 100644
--- a/drivers/mmc/host/mmci.h
+++ b/drivers/mmc/host/mmci.h
@@ -169,7 +169,7 @@ static inline char *mmci_kmap_atomic(struct mmci_host *host, unsigned long *flag
struct scatterlist *sg = host->sg_ptr;
local_irq_save(*flags);
- return kmap_atomic(sg->page, KM_BIO_SRC_IRQ) + sg->offset;
+ return kmap_atomic(sg_page(sg), KM_BIO_SRC_IRQ) + sg->offset;
}
static inline void mmci_kunmap_atomic(struct mmci_host *host, void *buffer, unsigned long *flags)
diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c
index 60a67dfcda6..971e18b91f4 100644
--- a/drivers/mmc/host/omap.c
+++ b/drivers/mmc/host/omap.c
@@ -24,10 +24,10 @@
#include <linux/mmc/host.h>
#include <linux/mmc/card.h>
#include <linux/clk.h>
+#include <linux/scatterlist.h>
#include <asm/io.h>
#include <asm/irq.h>
-#include <asm/scatterlist.h>
#include <asm/mach-types.h>
#include <asm/arch/board.h>
@@ -383,7 +383,7 @@ mmc_omap_sg_to_buf(struct mmc_omap_host *host)
sg = host->data->sg + host->sg_idx;
host->buffer_bytes_left = sg->length;
- host->buffer = page_address(sg->page) + sg->offset;
+ host->buffer = sg_virt(sg);
if (host->buffer_bytes_left > host->total_bytes_left)
host->buffer_bytes_left = host->total_bytes_left;
}
diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c
index b397121b947..d7c5b94d8c5 100644
--- a/drivers/mmc/host/sdhci.c
+++ b/drivers/mmc/host/sdhci.c
@@ -13,6 +13,7 @@
#include <linux/highmem.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
+#include <linux/scatterlist.h>
#include <linux/mmc/host.h>
@@ -231,7 +232,7 @@ static void sdhci_deactivate_led(struct sdhci_host *host)
static inline char* sdhci_sg_to_buffer(struct sdhci_host* host)
{
- return page_address(host->cur_sg->page) + host->cur_sg->offset;
+ return sg_virt(host->cur_sg);
}
static inline int sdhci_next_sg(struct sdhci_host* host)
diff --git a/drivers/mmc/host/tifm_sd.c b/drivers/mmc/host/tifm_sd.c
index 9b904795eb7..c11a3d25605 100644
--- a/drivers/mmc/host/tifm_sd.c
+++ b/drivers/mmc/host/tifm_sd.c
@@ -192,7 +192,7 @@ static void tifm_sd_transfer_data(struct tifm_sd *host)
}
off = sg[host->sg_pos].offset + host->block_pos;
- pg = nth_page(sg[host->sg_pos].page, off >> PAGE_SHIFT);
+ pg = nth_page(sg_page(&sg[host->sg_pos]), off >> PAGE_SHIFT);
p_off = offset_in_page(off);
p_cnt = PAGE_SIZE - p_off;
p_cnt = min(p_cnt, cnt);
@@ -241,18 +241,18 @@ static void tifm_sd_bounce_block(struct tifm_sd *host, struct mmc_data *r_data)
}
off = sg[host->sg_pos].offset + host->block_pos;
- pg = nth_page(sg[host->sg_pos].page, off >> PAGE_SHIFT);
+ pg = nth_page(sg_page(&sg[host->sg_pos]), off >> PAGE_SHIFT);
p_off = offset_in_page(off);
p_cnt = PAGE_SIZE - p_off;
p_cnt = min(p_cnt, cnt);
p_cnt = min(p_cnt, t_size);
if (r_data->flags & MMC_DATA_WRITE)
- tifm_sd_copy_page(host->bounce_buf.page,
+ tifm_sd_copy_page(sg_page(&host->bounce_buf),
r_data->blksz - t_size,
pg, p_off, p_cnt);
else if (r_data->flags & MMC_DATA_READ)
- tifm_sd_copy_page(pg, p_off, host->bounce_buf.page,
+ tifm_sd_copy_page(pg, p_off, sg_page(&host->bounce_buf),
r_data->blksz - t_size, p_cnt);
t_size -= p_cnt;
diff --git a/drivers/mmc/host/wbsd.c b/drivers/mmc/host/wbsd.c
index 80db11c05f2..fa4c8c53cc7 100644
--- a/drivers/mmc/host/wbsd.c
+++ b/drivers/mmc/host/wbsd.c
@@ -269,7 +269,7 @@ static inline int wbsd_next_sg(struct wbsd_host *host)
static inline char *wbsd_sg_to_buffer(struct wbsd_host *host)
{
- return page_address(host->cur_sg->page) + host->cur_sg->offset;
+ return sg_virt(host->cur_sg);
}
static inline void wbsd_sg_to_dma(struct wbsd_host *host, struct mmc_data *data)
@@ -283,7 +283,7 @@ static inline void wbsd_sg_to_dma(struct wbsd_host *host, struct mmc_data *data)
len = data->sg_len;
for (i = 0; i < len; i++) {
- sgbuf = page_address(sg[i].page) + sg[i].offset;
+ sgbuf = sg_virt(&sg[i]);
memcpy(dmabuf, sgbuf, sg[i].length);
dmabuf += sg[i].length;
}
@@ -300,7 +300,7 @@ static inline void wbsd_dma_to_sg(struct wbsd_host *host, struct mmc_data *data)
len = data->sg_len;
for (i = 0; i < len; i++) {
- sgbuf = page_address(sg[i].page) + sg[i].offset;
+ sgbuf = sg_virt(&sg[i]);
memcpy(sgbuf, dmabuf, sg[i].length);
dmabuf += sg[i].length;
}
diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c
index 3aa3dca56ae..a9eb1c51624 100644
--- a/drivers/mtd/chips/cfi_cmdset_0001.c
+++ b/drivers/mtd/chips/cfi_cmdset_0001.c
@@ -85,6 +85,7 @@ static int cfi_intelext_point (struct mtd_info *mtd, loff_t from, size_t len,
static void cfi_intelext_unpoint (struct mtd_info *mtd, u_char *addr, loff_t from,
size_t len);
+static int chip_ready (struct map_info *map, struct flchip *chip, unsigned long adr, int mode);
static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr, int mode);
static void put_chip(struct map_info *map, struct flchip *chip, unsigned long adr);
#include "fwh_lock.h"
@@ -641,73 +642,13 @@ static int cfi_intelext_partition_fixup(struct mtd_info *mtd,
/*
* *********** CHIP ACCESS FUNCTIONS ***********
*/
-
-static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr, int mode)
+static int chip_ready (struct map_info *map, struct flchip *chip, unsigned long adr, int mode)
{
DECLARE_WAITQUEUE(wait, current);
struct cfi_private *cfi = map->fldrv_priv;
map_word status, status_OK = CMD(0x80), status_PWS = CMD(0x01);
- unsigned long timeo;
struct cfi_pri_intelext *cfip = cfi->cmdset_priv;
-
- resettime:
- timeo = jiffies + HZ;
- retry:
- if (chip->priv && (mode == FL_WRITING || mode == FL_ERASING || mode == FL_OTP_WRITE || mode == FL_SHUTDOWN)) {
- /*
- * OK. We have possibility for contension on the write/erase
- * operations which are global to the real chip and not per
- * partition. So let's fight it over in the partition which
- * currently has authority on the operation.
- *
- * The rules are as follows:
- *
- * - any write operation must own shared->writing.
- *
- * - any erase operation must own _both_ shared->writing and
- * shared->erasing.
- *
- * - contension arbitration is handled in the owner's context.
- *
- * The 'shared' struct can be read and/or written only when
- * its lock is taken.
- */
- struct flchip_shared *shared = chip->priv;
- struct flchip *contender;
- spin_lock(&shared->lock);
- contender = shared->writing;
- if (contender && contender != chip) {
- /*
- * The engine to perform desired operation on this
- * partition is already in use by someone else.
- * Let's fight over it in the context of the chip
- * currently using it. If it is possible to suspend,
- * that other partition will do just that, otherwise
- * it'll happily send us to sleep. In any case, when
- * get_chip returns success we're clear to go ahead.
- */
- int ret = spin_trylock(contender->mutex);
- spin_unlock(&shared->lock);
- if (!ret)
- goto retry;
- spin_unlock(chip->mutex);
- ret = get_chip(map, contender, contender->start, mode);
- spin_lock(chip->mutex);
- if (ret) {
- spin_unlock(contender->mutex);
- return ret;
- }
- timeo = jiffies + HZ;
- spin_lock(&shared->lock);
- spin_unlock(contender->mutex);
- }
-
- /* We now own it */
- shared->writing = chip;
- if (mode == FL_ERASING)
- shared->erasing = chip;
- spin_unlock(&shared->lock);
- }
+ unsigned long timeo = jiffies + HZ;
switch (chip->state) {
@@ -722,16 +663,11 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr
if (chip->priv && map_word_andequal(map, status, status_PWS, status_PWS))
break;
- if (time_after(jiffies, timeo)) {
- printk(KERN_ERR "%s: Waiting for chip to be ready timed out. Status %lx\n",
- map->name, status.x[0]);
- return -EIO;
- }
spin_unlock(chip->mutex);
cfi_udelay(1);
spin_lock(chip->mutex);
/* Someone else might have been playing with it. */
- goto retry;
+ return -EAGAIN;
}
case FL_READY:
@@ -809,10 +745,82 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr
schedule();
remove_wait_queue(&chip->wq, &wait);
spin_lock(chip->mutex);
- goto resettime;
+ return -EAGAIN;
}
}
+static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr, int mode)
+{
+ int ret;
+
+ retry:
+ if (chip->priv && (mode == FL_WRITING || mode == FL_ERASING
+ || mode == FL_OTP_WRITE || mode == FL_SHUTDOWN)) {
+ /*
+ * OK. We have possibility for contention on the write/erase
+ * operations which are global to the real chip and not per
+ * partition. So let's fight it over in the partition which
+ * currently has authority on the operation.
+ *
+ * The rules are as follows:
+ *
+ * - any write operation must own shared->writing.
+ *
+ * - any erase operation must own _both_ shared->writing and
+ * shared->erasing.
+ *
+ * - contention arbitration is handled in the owner's context.
+ *
+ * The 'shared' struct can be read and/or written only when
+ * its lock is taken.
+ */
+ struct flchip_shared *shared = chip->priv;
+ struct flchip *contender;
+ spin_lock(&shared->lock);
+ contender = shared->writing;
+ if (contender && contender != chip) {
+ /*
+ * The engine to perform desired operation on this
+ * partition is already in use by someone else.
+ * Let's fight over it in the context of the chip
+ * currently using it. If it is possible to suspend,
+ * that other partition will do just that, otherwise
+ * it'll happily send us to sleep. In any case, when
+ * get_chip returns success we're clear to go ahead.
+ */
+ ret = spin_trylock(contender->mutex);
+ spin_unlock(&shared->lock);
+ if (!ret)
+ goto retry;
+ spin_unlock(chip->mutex);
+ ret = chip_ready(map, contender, contender->start, mode);
+ spin_lock(chip->mutex);
+
+ if (ret == -EAGAIN) {
+ spin_unlock(contender->mutex);
+ goto retry;
+ }
+ if (ret) {
+ spin_unlock(contender->mutex);
+ return ret;
+ }
+ spin_lock(&shared->lock);
+ spin_unlock(contender->mutex);
+ }
+
+ /* We now own it */
+ shared->writing = chip;
+ if (mode == FL_ERASING)
+ shared->erasing = chip;
+ spin_unlock(&shared->lock);
+ }
+ ret = chip_ready(map, chip, adr, mode);
+ if (ret == -EAGAIN)
+ goto retry;
+
+ return ret;
+}
+
static void put_chip(struct map_info *map, struct flchip *chip, unsigned long adr)
{
struct cfi_private *cfi = map->fldrv_priv;
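
The refactor above splits the old self-looping get_chip() in two: chip_ready() performs a single state check and returns -EAGAIN whenever the chip mutex had to be dropped, while get_chip() owns the one retry loop, including the partition-contention dance. A sketch of the resulting control flow, not taken verbatim from the patch:

	/* get_chip() is now the only place that loops; every transient
	 * condition inside chip_ready() surfaces as -EAGAIN. */
	do {
		ret = chip_ready(map, chip, adr, mode);
		/* chip->mutex may have been dropped and retaken, so the
		 * chip state must be re-evaluated from scratch. */
	} while (ret == -EAGAIN);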
diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
index 8f9c3baeb38..246d4512f64 100644
--- a/drivers/mtd/nand/Kconfig
+++ b/drivers/mtd/nand/Kconfig
@@ -300,7 +300,7 @@ config MTD_NAND_PLATFORM
via platform_data.
config MTD_ALAUDA
- tristate "MTD driver for Olympus MAUSB-10 and Fijufilm DPC-R1"
+ tristate "MTD driver for Olympus MAUSB-10 and Fujifilm DPC-R1"
depends on MTD_NAND && USB
help
These two (and possibly other) Alauda-based cardreaders for
diff --git a/drivers/mtd/nand/diskonchip.c b/drivers/mtd/nand/diskonchip.c
index ab9f5c5db38..0e72153b329 100644
--- a/drivers/mtd/nand/diskonchip.c
+++ b/drivers/mtd/nand/diskonchip.c
@@ -220,7 +220,7 @@ static int doc_ecc_decode(struct rs_control *rs, uint8_t *data, uint8_t *ecc)
}
}
/* If the parity is wrong, no rescue possible */
- return parity ? -1 : nerr;
+ return parity ? -EBADMSG : nerr;
}
static void DoC_Delay(struct doc_priv *doc, unsigned short cycles)
@@ -1034,7 +1034,7 @@ static int doc200x_correct_data(struct mtd_info *mtd, u_char *dat,
WriteDOC(DOC_ECC_DIS, docptr, Mplus_ECCConf);
else
WriteDOC(DOC_ECC_DIS, docptr, ECCConf);
- if (no_ecc_failures && (ret == -1)) {
+ if (no_ecc_failures && (ret == -EBADMSG)) {
printk(KERN_ERR "suppressing ECC failure\n");
ret = 0;
}
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index b4e0e772389..e29c1da7f56 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -789,7 +789,7 @@ static int nand_read_page_swecc(struct mtd_info *mtd, struct nand_chip *chip,
int stat;
stat = chip->ecc.correct(mtd, p, &ecc_code[i], &ecc_calc[i]);
- if (stat == -1)
+ if (stat < 0)
mtd->ecc_stats.failed++;
else
mtd->ecc_stats.corrected += stat;
@@ -833,7 +833,7 @@ static int nand_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
int stat;
stat = chip->ecc.correct(mtd, p, &ecc_code[i], &ecc_calc[i]);
- if (stat == -1)
+ if (stat < 0)
mtd->ecc_stats.failed++;
else
mtd->ecc_stats.corrected += stat;
@@ -874,7 +874,7 @@ static int nand_read_page_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
chip->read_buf(mtd, oob, eccbytes);
stat = chip->ecc.correct(mtd, p, oob, NULL);
- if (stat == -1)
+ if (stat < 0)
mtd->ecc_stats.failed++;
else
mtd->ecc_stats.corrected += stat;
diff --git a/drivers/mtd/nand/nand_ecc.c b/drivers/mtd/nand/nand_ecc.c
index fde593e5e63..9003a135e05 100644
--- a/drivers/mtd/nand/nand_ecc.c
+++ b/drivers/mtd/nand/nand_ecc.c
@@ -189,7 +189,7 @@ int nand_correct_data(struct mtd_info *mtd, u_char *dat,
if(countbits(s0 | ((uint32_t)s1 << 8) | ((uint32_t)s2 <<16)) == 1)
return 1;
- return -1;
+ return -EBADMSG;
}
EXPORT_SYMBOL(nand_correct_data);
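
The diskonchip, nand_base and nand_ecc hunks above all implement one convention change: ECC correction routines return the number of corrected bitflips (>= 0) or a negative errno (-EBADMSG) for an uncorrectable unit, instead of a bare -1. That lets callers use the generic "stat < 0" test and keeps a meaningful errno if the failure ever propagates further. The caller-side pattern, as nand_base now applies it:

	stat = chip->ecc.correct(mtd, p, &ecc_code[i], &ecc_calc[i]);
	if (stat < 0)			/* e.g. -EBADMSG */
		mtd->ecc_stats.failed++;
	else
		mtd->ecc_stats.corrected += stat;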
diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c
index a7574807dc4..10490b48d9f 100644
--- a/drivers/mtd/nand/nandsim.c
+++ b/drivers/mtd/nand/nandsim.c
@@ -511,7 +511,7 @@ static int init_nandsim(struct mtd_info *mtd)
}
if (ns->options & OPT_SMALLPAGE) {
- if (ns->geom.totsz < (64 << 20)) {
+ if (ns->geom.totsz < (32 << 20)) {
ns->geom.pgaddrbytes = 3;
ns->geom.secaddrbytes = 2;
} else {
diff --git a/drivers/mtd/nand/s3c2410.c b/drivers/mtd/nand/s3c2410.c
index 21b921dd6aa..66f76e9618d 100644
--- a/drivers/mtd/nand/s3c2410.c
+++ b/drivers/mtd/nand/s3c2410.c
@@ -488,12 +488,24 @@ static void s3c2410_nand_read_buf(struct mtd_info *mtd, u_char *buf, int len)
readsb(this->IO_ADDR_R, buf, len);
}
+static void s3c2440_nand_read_buf(struct mtd_info *mtd, u_char *buf, int len)
+{
+ struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd);
+ readsl(info->regs + S3C2440_NFDATA, buf, len / 4);
+}
+
static void s3c2410_nand_write_buf(struct mtd_info *mtd, const u_char *buf, int len)
{
struct nand_chip *this = mtd->priv;
writesb(this->IO_ADDR_W, buf, len);
}
+static void s3c2440_nand_write_buf(struct mtd_info *mtd, const u_char *buf, int len)
+{
+ struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd);
+ writesl(info->regs + S3C2440_NFDATA, buf, len / 4);
+}
+
/* device management functions */
static int s3c2410_nand_remove(struct platform_device *pdev)
@@ -604,6 +616,8 @@ static void s3c2410_nand_init_chip(struct s3c2410_nand_info *info,
info->sel_bit = S3C2440_NFCONT_nFCE;
chip->cmd_ctrl = s3c2440_nand_hwcontrol;
chip->dev_ready = s3c2440_nand_devready;
+ chip->read_buf = s3c2440_nand_read_buf;
+ chip->write_buf = s3c2440_nand_write_buf;
break;
case TYPE_S3C2412:
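
The new s3c2440 buffer helpers use 32-bit FIFO accesses (readsl/writesl) against S3C2440_NFDATA; the len / 4 arithmetic silently assumes len is a multiple of four, which holds for the page and OOB sizes this driver is handed. A hedged sketch of a defensive variant that would drain a non-aligned tail with byte accesses — hypothetical, not part of the patch:

	static void s3c2440_nand_read_buf_safe(struct mtd_info *mtd,
					       u_char *buf, int len)
	{
		struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd);

		readsl(info->regs + S3C2440_NFDATA, buf, len >> 2);
		if (len & 3)	/* drain the 1-3 trailing bytes, if any */
			readsb(info->regs + S3C2440_NFDATA,
			       buf + (len & ~3), len & 3);
	}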
diff --git a/drivers/mtd/onenand/onenand_sim.c b/drivers/mtd/onenand/onenand_sim.c
index 0d89ad5776f..d64200b7c94 100644
--- a/drivers/mtd/onenand/onenand_sim.c
+++ b/drivers/mtd/onenand/onenand_sim.c
@@ -88,11 +88,11 @@ do { \
/**
* onenand_lock_handle - Handle Lock scheme
- * @param this OneNAND device structure
- * @param cmd The command to be sent
+ * @this: OneNAND device structure
+ * @cmd: The command to be sent
*
* Send lock command to OneNAND device.
- * The lock scheme is depends on chip type.
+ * The lock scheme depends on chip type.
*/
static void onenand_lock_handle(struct onenand_chip *this, int cmd)
{
@@ -131,8 +131,8 @@ static void onenand_lock_handle(struct onenand_chip *this, int cmd)
/**
* onenand_bootram_handle - Handle BootRAM area
- * @param this OneNAND device structure
- * @param cmd The command to be sent
+ * @this: OneNAND device structure
+ * @cmd: The command to be sent
*
* Emulate BootRAM area. It is possible to do basic operation using BootRAM.
*/
@@ -153,10 +153,10 @@ static void onenand_bootram_handle(struct onenand_chip *this, int cmd)
/**
* onenand_update_interrupt - Set interrupt register
- * @param this OneNAND device structure
- * @param cmd The command to be sent
+ * @this: OneNAND device structure
+ * @cmd: The command to be sent
*
- * Update interrupt register. The status is depends on command.
+ * Update interrupt register. The status depends on command.
*/
static void onenand_update_interrupt(struct onenand_chip *this, int cmd)
{
@@ -189,11 +189,12 @@ static void onenand_update_interrupt(struct onenand_chip *this, int cmd)
}
/**
- * onenand_check_overwrite - Check over-write if happend
- * @param dest The destination pointer
- * @param src The source pointer
- * @param count The length to be check
- * @return 0 on same, otherwise 1
+ * onenand_check_overwrite - Check if over-write happened
+ * @dest: The destination pointer
+ * @src: The source pointer
+ * @count: The length to be checked
+ *
+ * Returns: 0 if the contents match, otherwise 1
*
* Compare the source with destination
*/
@@ -213,10 +214,10 @@ static int onenand_check_overwrite(void *dest, void *src, size_t count)
/**
* onenand_data_handle - Handle OneNAND Core and DataRAM
- * @param this OneNAND device structure
- * @param cmd The command to be sent
- * @param dataram Which dataram used
- * @param offset The offset to OneNAND Core
+ * @this: OneNAND device structure
+ * @cmd: The command to be sent
+ * @dataram: Which dataram used
+ * @offset: The offset to OneNAND Core
*
* Copy data from OneNAND Core to DataRAM (read)
* Copy data from DataRAM to OneNAND Core (write)
@@ -295,8 +296,8 @@ static void onenand_data_handle(struct onenand_chip *this, int cmd,
/**
* onenand_command_handle - Handle command
- * @param this OneNAND device structure
- * @param cmd The command to be sent
+ * @this: OneNAND device structure
+ * @cmd: The command to be sent
*
* Emulate OneNAND command.
*/
@@ -350,8 +351,8 @@ static void onenand_command_handle(struct onenand_chip *this, int cmd)
/**
* onenand_writew - [OneNAND Interface] Emulate write operation
- * @param value value to write
- * @param addr address to write
+ * @value: value to write
+ * @addr: address to write
*
* Write OneNAND register with value
*/
@@ -373,7 +374,7 @@ static void onenand_writew(unsigned short value, void __iomem * addr)
/**
* flash_init - Initialize OneNAND simulator
- * @param flash OneNAND simulaotr data strucutres
+ * @flash: OneNAND simulator data structures
*
* Initialize OneNAND simulator.
*/
@@ -416,7 +417,7 @@ static int __init flash_init(struct onenand_flash *flash)
/**
* flash_exit - Clean up OneNAND simulator
- * @param flash OneNAND simulaotr data strucutres
+ * @flash: OneNAND simulator data structures
*
* Clean up OneNAND simulator.
*/
@@ -424,7 +425,6 @@ static void flash_exit(struct onenand_flash *flash)
{
vfree(ONENAND_CORE(flash));
kfree(flash->base);
- kfree(flash);
}
static int __init onenand_sim_init(void)
@@ -449,7 +449,7 @@ static int __init onenand_sim_init(void)
info->onenand.write_word = onenand_writew;
if (flash_init(&info->flash)) {
- printk(KERN_ERR "Unable to allocat flash.\n");
+ printk(KERN_ERR "Unable to allocate flash.\n");
kfree(ffchars);
kfree(info);
return -ENOMEM;
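
The onenand_sim hunks above are pure kernel-doc conversions: the old "@param name description" lines are rewritten to the "@name: description" form that scripts/kernel-doc actually parses, alongside the spelling fixes. For reference, a minimal well-formed block in the converted style:

	/**
	 * onenand_lock_handle - Handle lock scheme
	 * @this:	OneNAND device structure
	 * @cmd:	The command to be sent
	 *
	 * Send lock command to the OneNAND device.
	 */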
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index eb75773a9eb..86b8641b466 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -3103,4 +3103,10 @@ config NETPOLL_TRAP
config NET_POLL_CONTROLLER
def_bool NETPOLL
+config VIRTIO_NET
+ tristate "Virtio network driver (EXPERIMENTAL)"
+ depends on EXPERIMENTAL && VIRTIO
+ ---help---
+ This is the virtual network driver for lguest. Say Y or M.
+
endif # NETDEVICES
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 22f78cbd126..593262065c9 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -183,7 +183,6 @@ obj-$(CONFIG_ZORRO8390) += zorro8390.o
obj-$(CONFIG_HPLANCE) += hplance.o 7990.o
obj-$(CONFIG_MVME147_NET) += mvme147.o 7990.o
obj-$(CONFIG_EQUALIZER) += eql.o
-obj-$(CONFIG_LGUEST_NET) += lguest_net.o
obj-$(CONFIG_MIPS_JAZZ_SONIC) += jazzsonic.o
obj-$(CONFIG_MIPS_AU1X00_ENET) += au1000_eth.o
obj-$(CONFIG_MIPS_SIM_NET) += mipsnet.o
@@ -243,3 +242,4 @@ obj-$(CONFIG_FS_ENET) += fs_enet/
obj-$(CONFIG_NETXEN_NIC) += netxen/
obj-$(CONFIG_NIU) += niu.o
+obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
diff --git a/drivers/net/cpmac.c b/drivers/net/cpmac.c
index ed53aaab4c0..ae419736158 100644
--- a/drivers/net/cpmac.c
+++ b/drivers/net/cpmac.c
@@ -471,7 +471,7 @@ static int cpmac_start_xmit(struct sk_buff *skb, struct net_device *dev)
}
len = max(skb->len, ETH_ZLEN);
- queue = skb->queue_mapping;
+ queue = skb_get_queue_mapping(skb);
#ifdef CONFIG_NETDEVICES_MULTIQUEUE
netif_stop_subqueue(dev, queue);
#else
diff --git a/drivers/net/fec.c b/drivers/net/fec.c
index 2b5782056dd..0fbf1bbbaee 100644
--- a/drivers/net/fec.c
+++ b/drivers/net/fec.c
@@ -751,13 +751,11 @@ mii_queue(struct net_device *dev, int regval, void (*func)(uint, struct net_devi
if (mii_head) {
mii_tail->mii_next = mip;
mii_tail = mip;
- }
- else {
+ } else {
mii_head = mii_tail = mip;
fep->hwp->fec_mii_data = regval;
}
- }
- else {
+ } else {
retval = 1;
}
@@ -768,14 +766,11 @@ mii_queue(struct net_device *dev, int regval, void (*func)(uint, struct net_devi
static void mii_do_cmd(struct net_device *dev, const phy_cmd_t *c)
{
- int k;
-
if(!c)
return;
- for(k = 0; (c+k)->mii_data != mk_mii_end; k++) {
- mii_queue(dev, (c+k)->mii_data, (c+k)->funct);
- }
+ for (; c->mii_data != mk_mii_end; c++)
+ mii_queue(dev, c->mii_data, c->funct);
}
static void mii_parse_sr(uint mii_reg, struct net_device *dev)
@@ -792,7 +787,6 @@ static void mii_parse_sr(uint mii_reg, struct net_device *dev)
status |= PHY_STAT_FAULT;
if (mii_reg & 0x0020)
status |= PHY_STAT_ANC;
-
*s = status;
}
@@ -1239,7 +1233,6 @@ mii_link_interrupt(int irq, void * dev_id);
#endif
#if defined(CONFIG_M5272)
-
/*
* Code specific to Coldfire 5272 setup.
*/
@@ -2020,8 +2013,7 @@ static void mii_relink(struct work_struct *work)
& (PHY_STAT_100FDX | PHY_STAT_10FDX))
duplex = 1;
fec_restart(dev, duplex);
- }
- else
+ } else
fec_stop(dev);
#if 0
@@ -2119,8 +2111,7 @@ mii_discover_phy(uint mii_reg, struct net_device *dev)
fep->phy_id = phytype << 16;
mii_queue(dev, mk_mii_read(MII_REG_PHYIR2),
mii_discover_phy3);
- }
- else {
+ } else {
fep->phy_addr++;
mii_queue(dev, mk_mii_read(MII_REG_PHYIR1),
mii_discover_phy);
@@ -2574,8 +2565,7 @@ fec_restart(struct net_device *dev, int duplex)
if (duplex) {
fecp->fec_r_cntrl = OPT_FRAME_SIZE | 0x04;/* MII enable */
fecp->fec_x_cntrl = 0x04; /* FD enable */
- }
- else {
+ } else {
/* MII enable|No Rcv on Xmit */
fecp->fec_r_cntrl = OPT_FRAME_SIZE | 0x06;
fecp->fec_x_cntrl = 0x00;
diff --git a/drivers/net/lguest_net.c b/drivers/net/lguest_net.c
deleted file mode 100644
index abce2ee8430..00000000000
--- a/drivers/net/lguest_net.c
+++ /dev/null
@@ -1,555 +0,0 @@
-/*D:500
- * The Guest network driver.
- *
- * This is very simple a virtual network driver, and our last Guest driver.
- * The only trick is that it can talk directly to multiple other recipients
- * (ie. other Guests on the same network). It can also be used with only the
- * Host on the network.
- :*/
-
-/* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-//#define DEBUG
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/module.h>
-#include <linux/mm_types.h>
-#include <linux/io.h>
-#include <linux/lguest_bus.h>
-
-#define SHARED_SIZE PAGE_SIZE
-#define MAX_LANS 4
-#define NUM_SKBS 8
-
-/*M:011 Network code master Jeff Garzik points out numerous shortcomings in
- * this driver if it aspires to greatness.
- *
- * Firstly, it doesn't use "NAPI": the networking's New API, and is poorer for
- * it. As he says "NAPI means system-wide load leveling, across multiple
- * network interfaces. Lack of NAPI can mean competition at higher loads."
- *
- * He also points out that we don't implement set_mac_address, so users cannot
- * change the devices hardware address. When I asked why one would want to:
- * "Bonding, and situations where you /do/ want the MAC address to "leak" out
- * of the host onto the wider net."
- *
- * Finally, he would like module unloading: "It is not unrealistic to think of
- * [un|re|]loading the net support module in an lguest guest. And, adding
- * module support makes the programmer more responsible, because they now have
- * to learn to clean up after themselves. Any driver that cannot clean up
- * after itself is an incomplete driver in my book."
- :*/
-
-/*D:530 The "struct lguestnet_info" contains all the information we need to
- * know about the network device. */
-struct lguestnet_info
-{
- /* The mapped device page(s) (an array of "struct lguest_net"). */
- struct lguest_net *peer;
- /* The physical address of the device page(s) */
- unsigned long peer_phys;
- /* The size of the device page(s). */
- unsigned long mapsize;
-
- /* The lguest_device I come from */
- struct lguest_device *lgdev;
-
- /* My peerid (ie. my slot in the array). */
- unsigned int me;
-
- /* Receive queue: the network packets waiting to be filled. */
- struct sk_buff *skb[NUM_SKBS];
- struct lguest_dma dma[NUM_SKBS];
-};
-/*:*/
-
-/* How many bytes left in this page. */
-static unsigned int rest_of_page(void *data)
-{
- return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE);
-}
-
-/*D:570 Each peer (ie. Guest or Host) on the network binds their receive
- * buffers to a different key: we simply use the physical address of the
- * device's memory page plus the peer number. The Host insists that all keys
- * be a multiple of 4, so we multiply the peer number by 4. */
-static unsigned long peer_key(struct lguestnet_info *info, unsigned peernum)
-{
- return info->peer_phys + 4 * peernum;
-}
-
-/* This is the routine which sets up a "struct lguest_dma" to point to a
- * network packet, similar to req_to_dma() in lguest_blk.c. The structure of a
- * "struct sk_buff" has grown complex over the years: it consists of a "head"
- * linear section pointed to by "skb->data", and possibly an array of
- * "fragments" in the case of a non-linear packet.
- *
- * Our receive buffers don't use fragments at all but outgoing skbs might, so
- * we handle it. */
-static void skb_to_dma(const struct sk_buff *skb, unsigned int headlen,
- struct lguest_dma *dma)
-{
- unsigned int i, seg;
-
- /* First, we put the linear region into the "struct lguest_dma". Each
- * entry can't go over a page boundary, so even though all our packets
- * are 1514 bytes or less, we might need to use two entries here: */
- for (i = seg = 0; i < headlen; seg++, i += rest_of_page(skb->data+i)) {
- dma->addr[seg] = virt_to_phys(skb->data + i);
- dma->len[seg] = min((unsigned)(headlen - i),
- rest_of_page(skb->data + i));
- }
-
- /* Now we handle the fragments: at least they're guaranteed not to go
- * over a page. skb_shinfo(skb) returns a pointer to the structure
- * which tells us about the number of fragments and the fragment
- * array. */
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, seg++) {
- const skb_frag_t *f = &skb_shinfo(skb)->frags[i];
- /* Should not happen with MTU less than 64k - 2 * PAGE_SIZE. */
- if (seg == LGUEST_MAX_DMA_SECTIONS) {
- /* We will end up sending a truncated packet should
- * this ever happen. Plus, a cool log message! */
- printk("Woah dude! Megapacket!\n");
- break;
- }
- dma->addr[seg] = page_to_phys(f->page) + f->page_offset;
- dma->len[seg] = f->size;
- }
-
- /* If after all that we didn't use the entire "struct lguest_dma"
- * array, we terminate it with a 0 length. */
- if (seg < LGUEST_MAX_DMA_SECTIONS)
- dma->len[seg] = 0;
-}
-
-/*
- * Packet transmission.
- *
- * Our packet transmission is a little unusual. A real network card would just
- * send out the packet and leave the receivers to decide if they're interested.
- * Instead, we look through the network device memory page and see if any of
- * the ethernet addresses match the packet destination, and if so we send it to
- * that Guest.
- *
- * This is made a little more complicated in two cases. The first case is
- * broadcast packets: for that we send the packet to all Guests on the network,
- * one at a time. The second case is "promiscuous" mode, where a Guest wants
- * to see all the packets on the network. We need a way for the Guest to tell
- * us it wants to see all packets, so it sets the "multicast" bit on its
- * published MAC address, which is never valid in a real ethernet address.
- */
-#define PROMISC_BIT 0x01
-
-/* This is the callback which is summoned whenever the network device's
- * multicast or promiscuous state changes. If the card is in promiscuous mode,
- * we advertise that in our ethernet address in the device's memory. We do the
- * same if Linux wants any or all multicast traffic. */
-static void lguestnet_set_multicast(struct net_device *dev)
-{
- struct lguestnet_info *info = netdev_priv(dev);
-
- if ((dev->flags & (IFF_PROMISC|IFF_ALLMULTI)) || dev->mc_count)
- info->peer[info->me].mac[0] |= PROMISC_BIT;
- else
- info->peer[info->me].mac[0] &= ~PROMISC_BIT;
-}
-
-/* A simple test function to see if a peer wants to see all packets.*/
-static int promisc(struct lguestnet_info *info, unsigned int peer)
-{
- return info->peer[peer].mac[0] & PROMISC_BIT;
-}
-
-/* Another simple function to see if a peer's advertised ethernet address
- * matches a packet's destination ethernet address. */
-static int mac_eq(const unsigned char mac[ETH_ALEN],
- struct lguestnet_info *info, unsigned int peer)
-{
- /* Ignore multicast bit, which peer turns on to mean promisc. */
- if ((info->peer[peer].mac[0] & (~PROMISC_BIT)) != mac[0])
- return 0;
- return memcmp(mac+1, info->peer[peer].mac+1, ETH_ALEN-1) == 0;
-}
-
-/* This is the function which actually sends a packet once we've decided a
- * peer wants it: */
-static void transfer_packet(struct net_device *dev,
- struct sk_buff *skb,
- unsigned int peernum)
-{
- struct lguestnet_info *info = netdev_priv(dev);
- struct lguest_dma dma;
-
- /* We use our handy "struct lguest_dma" packing function to prepare
- * the skb for sending. */
- skb_to_dma(skb, skb_headlen(skb), &dma);
- pr_debug("xfer length %04x (%u)\n", htons(skb->len), skb->len);
-
- /* This is the actual send call which copies the packet. */
- lguest_send_dma(peer_key(info, peernum), &dma);
-
- /* Check that the entire packet was transmitted. If not, it could mean
- * that the other Guest registered a short receive buffer, but this
- * driver should never do that. More likely, the peer is dead. */
- if (dma.used_len != skb->len) {
- dev->stats.tx_carrier_errors++;
- pr_debug("Bad xfer to peer %i: %i of %i (dma %p/%i)\n",
- peernum, dma.used_len, skb->len,
- (void *)dma.addr[0], dma.len[0]);
- } else {
- /* On success we update the stats. */
- dev->stats.tx_bytes += skb->len;
- dev->stats.tx_packets++;
- }
-}
-
-/* Another helper function to tell is if a slot in the device memory is unused.
- * Since we always set the Local Assignment bit in the ethernet address, the
- * first byte can never be 0. */
-static int unused_peer(const struct lguest_net peer[], unsigned int num)
-{
- return peer[num].mac[0] == 0;
-}
-
-/* Finally, here is the routine which handles an outgoing packet. It's called
- * "start_xmit" for traditional reasons. */
-static int lguestnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
-{
- unsigned int i;
- int broadcast;
- struct lguestnet_info *info = netdev_priv(dev);
- /* Extract the destination ethernet address from the packet. */
- const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
- DECLARE_MAC_BUF(mac);
-
- pr_debug("%s: xmit %s\n", dev->name, print_mac(mac, dest));
-
- /* If it's a multicast packet, we broadcast to everyone. That's not
- * very efficient, but there are very few applications which actually
- * use multicast, which is a shame really.
- *
- * As etherdevice.h points out: "By definition the broadcast address is
- * also a multicast address." So we don't have to test for broadcast
- * packets separately. */
- broadcast = is_multicast_ether_addr(dest);
-
- /* Look through all the published ethernet addresses to see if we
- * should send this packet. */
- for (i = 0; i < info->mapsize/sizeof(struct lguest_net); i++) {
- /* We don't send to ourselves (we actually can't SEND_DMA to
- * ourselves anyway), and don't send to unused slots.*/
- if (i == info->me || unused_peer(info->peer, i))
- continue;
-
- /* If it's broadcast we send it. If they want every packet we
- * send it. If the destination matches their address we send
- * it. Otherwise we go to the next peer. */
- if (!broadcast && !promisc(info, i) && !mac_eq(dest, info, i))
- continue;
-
- pr_debug("lguestnet %s: sending from %i to %i\n",
- dev->name, info->me, i);
- /* Our routine which actually does the transfer. */
- transfer_packet(dev, skb, i);
- }
-
- /* An xmit routine is expected to dispose of the packet, so we do. */
- dev_kfree_skb(skb);
-
- /* As per kernel convention, 0 means success. This is why I love
- * networking: even if we never sent to anyone, that's still
- * success! */
- return 0;
-}
-
-/*D:560
- * Packet receiving.
- *
- * First, here's a helper routine which fills one of our array of receive
- * buffers: */
-static int fill_slot(struct net_device *dev, unsigned int slot)
-{
- struct lguestnet_info *info = netdev_priv(dev);
-
- /* We can receive ETH_DATA_LEN (1500) byte packets, plus a standard
- * ethernet header of ETH_HLEN (14) bytes. */
- info->skb[slot] = netdev_alloc_skb(dev, ETH_HLEN + ETH_DATA_LEN);
- if (!info->skb[slot]) {
- printk("%s: could not fill slot %i\n", dev->name, slot);
- return -ENOMEM;
- }
-
- /* skb_to_dma() is a helper which sets up the "struct lguest_dma" to
- * point to the data in the skb: we also use it for sending out a
- * packet. */
- skb_to_dma(info->skb[slot], ETH_HLEN + ETH_DATA_LEN, &info->dma[slot]);
-
- /* This is a Write Memory Barrier: it ensures that the entry in the
- * receive buffer array is written *before* we set the "used_len" entry
- * to 0. If the Host were looking at the receive buffer array from a
- * different CPU, it could potentially see "used_len = 0" and not see
- * the updated receive buffer information. This would be a horribly
- * nasty bug, so make sure the compiler and CPU know this has to happen
- * first. */
- wmb();
- /* Writing 0 to "used_len" tells the Host it can use this receive
- * buffer now. */
- info->dma[slot].used_len = 0;
- return 0;
-}
-
-/* This is the actual receive routine. When we receive an interrupt from the
- * Host to tell us a packet has been delivered, we arrive here: */
-static irqreturn_t lguestnet_rcv(int irq, void *dev_id)
-{
- struct net_device *dev = dev_id;
- struct lguestnet_info *info = netdev_priv(dev);
- unsigned int i, done = 0;
-
- /* Look through our entire receive array for an entry which has data
- * in it. */
- for (i = 0; i < ARRAY_SIZE(info->dma); i++) {
- unsigned int length;
- struct sk_buff *skb;
-
- length = info->dma[i].used_len;
- if (length == 0)
- continue;
-
- /* We've found one! Remember the skb (we grabbed the length
- * above), and immediately refill the slot we've taken it
- * from. */
- done++;
- skb = info->skb[i];
- fill_slot(dev, i);
-
- /* This shouldn't happen: micropackets could be sent by a
- * badly-behaved Guest on the network, but the Host will never
- * stuff more data in the buffer than the buffer length. */
- if (length < ETH_HLEN || length > ETH_HLEN + ETH_DATA_LEN) {
- pr_debug(KERN_WARNING "%s: unbelievable skb len: %i\n",
- dev->name, length);
- dev_kfree_skb(skb);
- continue;
- }
-
- /* skb_put(), what a great function! I've ranted about this
- * function before (http://lkml.org/lkml/1999/9/26/24). You
- * call it after you've added data to the end of an skb (in
- * this case, it was the Host which wrote the data). */
- skb_put(skb, length);
-
- /* The ethernet header contains a protocol field: we use the
- * standard helper to extract it, and place the result in
- * skb->protocol. The helper also sets up skb->pkt_type and
- * eats up the ethernet header from the front of the packet. */
- skb->protocol = eth_type_trans(skb, dev);
-
- /* If this device doesn't need checksums for sending, we also
- * don't need to check the packets when they come in. */
- if (dev->features & NETIF_F_NO_CSUM)
- skb->ip_summed = CHECKSUM_UNNECESSARY;
-
- /* As a last resort for debugging the driver or the lguest I/O
- * subsystem, you can uncomment the "#define DEBUG" at the top
- * of this file, which turns all the pr_debug() into printk()
- * and floods the logs. */
- pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
- ntohs(skb->protocol), skb->len, skb->pkt_type);
-
- /* Update the packet and byte counts (visible from ifconfig,
- * and good for debugging). */
- dev->stats.rx_bytes += skb->len;
- dev->stats.rx_packets++;
-
- /* Hand our fresh network packet into the stack's "network
- * interface receive" routine. That will free the packet
- * itself when it's finished. */
- netif_rx(skb);
- }
-
- /* If we found any packets, we assume the interrupt was for us. */
- return done ? IRQ_HANDLED : IRQ_NONE;
-}
-
-/*D:550 This is where we start: when the device is brought up by dhcpd or
- * ifconfig. At this point we advertise our MAC address to the rest of the
- * network, and register receive buffers ready for incoming packets. */
-static int lguestnet_open(struct net_device *dev)
-{
- int i;
- struct lguestnet_info *info = netdev_priv(dev);
-
- /* Copy our MAC address into the device page, so others on the network
- * can find us. */
- memcpy(info->peer[info->me].mac, dev->dev_addr, ETH_ALEN);
-
- /* We might already be in promisc mode (dev->flags & IFF_PROMISC). Our
- * set_multicast callback handles this already, so we call it now. */
- lguestnet_set_multicast(dev);
-
- /* Allocate packets and put them into our "struct lguest_dma" array.
- * If we fail to allocate all the packets we could still limp along,
- * but it's a sign of real stress so we should probably give up now. */
- for (i = 0; i < ARRAY_SIZE(info->dma); i++) {
- if (fill_slot(dev, i) != 0)
- goto cleanup;
- }
-
- /* Finally we tell the Host where our array of "struct lguest_dma"
- * receive buffers is, binding it to the key corresponding to the
- * device's physical memory plus our peerid. */
- if (lguest_bind_dma(peer_key(info,info->me), info->dma,
- NUM_SKBS, lgdev_irq(info->lgdev)) != 0)
- goto cleanup;
- return 0;
-
-cleanup:
- while (--i >= 0)
- dev_kfree_skb(info->skb[i]);
- return -ENOMEM;
-}
-/*:*/
-
-/* The close routine is called when the device is no longer in use: we clean up
- * elegantly. */
-static int lguestnet_close(struct net_device *dev)
-{
- unsigned int i;
- struct lguestnet_info *info = netdev_priv(dev);
-
- /* Clear all trace of our existence out of the device memory by setting
- * the slot which held our MAC address to 0 (unused). */
- memset(&info->peer[info->me], 0, sizeof(info->peer[info->me]));
-
- /* Unregister our array of receive buffers */
- lguest_unbind_dma(peer_key(info, info->me), info->dma);
- for (i = 0; i < ARRAY_SIZE(info->dma); i++)
- dev_kfree_skb(info->skb[i]);
- return 0;
-}
-
-/*D:510 The network device probe function is basically a standard ethernet
- * device setup. It reads the "struct lguest_device_desc" and sets the "struct
- * net_device". Oh, the line-by-line excitement! Let's skip over it. :*/
-static int lguestnet_probe(struct lguest_device *lgdev)
-{
- int err, irqf = IRQF_SHARED;
- struct net_device *dev;
- struct lguestnet_info *info;
- struct lguest_device_desc *desc = &lguest_devices[lgdev->index];
-
- pr_debug("lguest_net: probing for device %i\n", lgdev->index);
-
- dev = alloc_etherdev(sizeof(struct lguestnet_info));
- if (!dev)
- return -ENOMEM;
-
- /* Ethernet defaults with some changes */
- ether_setup(dev);
- dev->set_mac_address = NULL;
-
- dev->dev_addr[0] = 0x02; /* set local assignment bit (IEEE802) */
- dev->dev_addr[1] = 0x00;
- memcpy(&dev->dev_addr[2], &lguest_data.guestid, 2);
- dev->dev_addr[4] = 0x00;
- dev->dev_addr[5] = 0x00;
-
- dev->open = lguestnet_open;
- dev->stop = lguestnet_close;
- dev->hard_start_xmit = lguestnet_start_xmit;
-
- /* We don't actually support multicast yet, but turning on/off
- * promisc also calls dev->set_multicast_list. */
- dev->set_multicast_list = lguestnet_set_multicast;
- SET_NETDEV_DEV(dev, &lgdev->dev);
-
- /* The network code complains if you have "scatter-gather" capability
- * if you don't also handle checksums (it seem that would be
- * "illogical"). So we use a lie of omission and don't tell it that we
- * can handle scattered packets unless we also don't want checksums,
- * even though to us they're completely independent. */
- if (desc->features & LGUEST_NET_F_NOCSUM)
- dev->features = NETIF_F_SG|NETIF_F_NO_CSUM;
-
- info = netdev_priv(dev);
- info->mapsize = PAGE_SIZE * desc->num_pages;
- info->peer_phys = ((unsigned long)desc->pfn << PAGE_SHIFT);
- info->lgdev = lgdev;
- info->peer = lguest_map(info->peer_phys, desc->num_pages);
- if (!info->peer) {
- err = -ENOMEM;
- goto free;
- }
-
- /* This stores our peerid (upper bits reserved for future). */
- info->me = (desc->features & (info->mapsize-1));
-
- err = register_netdev(dev);
- if (err) {
- pr_debug("lguestnet: registering device failed\n");
- goto unmap;
- }
-
- if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS)
- irqf |= IRQF_SAMPLE_RANDOM;
- if (request_irq(lgdev_irq(lgdev), lguestnet_rcv, irqf, "lguestnet",
- dev) != 0) {
- pr_debug("lguestnet: cannot get irq %i\n", lgdev_irq(lgdev));
- goto unregister;
- }
-
- pr_debug("lguestnet: registered device %s\n", dev->name);
- /* Finally, we put the "struct net_device" in the generic "struct
- * lguest_device"s private pointer. Again, it's not necessary, but
- * makes sure the cool kernel kids don't tease us. */
- lgdev->private = dev;
- return 0;
-
-unregister:
- unregister_netdev(dev);
-unmap:
- lguest_unmap(info->peer);
-free:
- free_netdev(dev);
- return err;
-}
-
-static struct lguest_driver lguestnet_drv = {
- .name = "lguestnet",
- .owner = THIS_MODULE,
- .device_type = LGUEST_DEVICE_T_NET,
- .probe = lguestnet_probe,
-};
-
-static __init int lguestnet_init(void)
-{
- return register_lguest_driver(&lguestnet_drv);
-}
-module_init(lguestnet_init);
-
-MODULE_DESCRIPTION("Lguest network driver");
-MODULE_LICENSE("GPL");
-
-/*D:580
- * This is the last of the Drivers, and with this we have covered the many and
- * wonderous and fine (and boring) details of the Guest.
- *
- * "make Launcher" beckons, where we answer questions like "Where do Guests
- * come from?", and "What do you do when someone asks for optimization?"
- */
diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c
index 6471d33afb7..50648738d67 100644
--- a/drivers/net/mlx4/fw.c
+++ b/drivers/net/mlx4/fw.c
@@ -736,7 +736,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
MLX4_PUT(inbox, (u8) (PAGE_SHIFT - 12), INIT_HCA_UAR_PAGE_SZ_OFFSET);
MLX4_PUT(inbox, param->log_uar_sz, INIT_HCA_LOG_UAR_SZ_OFFSET);
- err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, 1000);
+ err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, 10000);
if (err)
mlx4_err(dev, "INIT_HCA returns %d\n", err);
diff --git a/drivers/net/mlx4/icm.c b/drivers/net/mlx4/icm.c
index 4b3c109d5ea..887633b207d 100644
--- a/drivers/net/mlx4/icm.c
+++ b/drivers/net/mlx4/icm.c
@@ -60,7 +60,7 @@ static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chu
PCI_DMA_BIDIRECTIONAL);
for (i = 0; i < chunk->npages; ++i)
- __free_pages(chunk->mem[i].page,
+ __free_pages(sg_page(&chunk->mem[i]),
get_order(chunk->mem[i].length));
}
@@ -70,7 +70,7 @@ static void mlx4_free_icm_coherent(struct mlx4_dev *dev, struct mlx4_icm_chunk *
for (i = 0; i < chunk->npages; ++i)
dma_free_coherent(&dev->pdev->dev, chunk->mem[i].length,
- lowmem_page_address(chunk->mem[i].page),
+ lowmem_page_address(sg_page(&chunk->mem[i])),
sg_dma_address(&chunk->mem[i]));
}
@@ -95,10 +95,13 @@ void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent)
static int mlx4_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_mask)
{
- mem->page = alloc_pages(gfp_mask, order);
- if (!mem->page)
+ struct page *page;
+
+ page = alloc_pages(gfp_mask, order);
+ if (!page)
return -ENOMEM;
+ sg_set_page(mem, page);
mem->length = PAGE_SIZE << order;
mem->offset = 0;
return 0;
@@ -145,6 +148,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
if (!chunk)
goto fail;
+ sg_init_table(chunk->mem, MLX4_ICM_CHUNK_LEN);
chunk->npages = 0;
chunk->nsg = 0;
list_add_tail(&chunk->list, &icm->chunk_list);
@@ -334,7 +338,7 @@ void *mlx4_table_find(struct mlx4_icm_table *table, int obj, dma_addr_t *dma_han
* been assigned to.
*/
if (chunk->mem[i].length > offset) {
- page = chunk->mem[i].page;
+ page = sg_page(&chunk->mem[i]);
goto out;
}
offset -= chunk->mem[i].length;
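
The mlx4 ICM hunks follow the same scatterlist rework: sg_init_table() must run before any entry is touched so the end-of-table marker in page_link is valid, and pages are stored via sg_set_page() (the two-argument form in this kernel) rather than by assigning ->page directly. A sketch of the allocation pattern the patch establishes:

	sg_init_table(chunk->mem, MLX4_ICM_CHUNK_LEN);	/* set end markers */
	...
	page = alloc_pages(gfp_mask, order);
	if (!page)
		return -ENOMEM;
	sg_set_page(mem, page);		/* encodes page into page_link */
	mem->length = PAGE_SIZE << order;
	mem->offset = 0;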
diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c
index 5b41e8bdd6b..651c2699d5e 100644
--- a/drivers/net/mv643xx_eth.c
+++ b/drivers/net/mv643xx_eth.c
@@ -3274,6 +3274,7 @@ static const struct ethtool_ops mv643xx_ethtool_ops = {
.get_drvinfo = mv643xx_get_drvinfo,
.get_link = mv643xx_eth_get_link,
.set_sg = ethtool_op_set_sg,
+ .get_sset_count = mv643xx_get_sset_count,
.get_ethtool_stats = mv643xx_get_ethtool_stats,
.get_strings = mv643xx_get_strings,
.nway_reset = mv643xx_eth_nway_restart,
diff --git a/drivers/net/niu.c b/drivers/net/niu.c
index ed1f9bbb2a3..112ab079ce7 100644
--- a/drivers/net/niu.c
+++ b/drivers/net/niu.c
@@ -3103,31 +3103,12 @@ static int niu_alloc_tx_ring_info(struct niu *np,
static void niu_size_rbr(struct niu *np, struct rx_ring_info *rp)
{
- u16 bs;
+ u16 bss;
- switch (PAGE_SIZE) {
- case 4 * 1024:
- case 8 * 1024:
- case 16 * 1024:
- case 32 * 1024:
- rp->rbr_block_size = PAGE_SIZE;
- rp->rbr_blocks_per_page = 1;
- break;
+ bss = min(PAGE_SHIFT, 15);
- default:
- if (PAGE_SIZE % (32 * 1024) == 0)
- bs = 32 * 1024;
- else if (PAGE_SIZE % (16 * 1024) == 0)
- bs = 16 * 1024;
- else if (PAGE_SIZE % (8 * 1024) == 0)
- bs = 8 * 1024;
- else if (PAGE_SIZE % (4 * 1024) == 0)
- bs = 4 * 1024;
- else
- BUG();
- rp->rbr_block_size = bs;
- rp->rbr_blocks_per_page = PAGE_SIZE / bs;
- }
+ rp->rbr_block_size = 1 << bss;
+ rp->rbr_blocks_per_page = 1 << (PAGE_SHIFT-bss);
rp->rbr_sizes[0] = 256;
rp->rbr_sizes[1] = 1024;
@@ -7902,12 +7883,7 @@ static int __init niu_init(void)
{
int err = 0;
- BUILD_BUG_ON((PAGE_SIZE < 4 * 1024) ||
- ((PAGE_SIZE > 32 * 1024) &&
- ((PAGE_SIZE % (32 * 1024)) != 0 &&
- (PAGE_SIZE % (16 * 1024)) != 0 &&
- (PAGE_SIZE % (8 * 1024)) != 0 &&
- (PAGE_SIZE % (4 * 1024)) != 0)));
+ BUILD_BUG_ON(PAGE_SIZE < 4 * 1024);
niu_debug = netif_msg_init(debug, NIU_MSG_DEFAULT);
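
Worked through, the new computation reproduces the old switch table for every power-of-two page size of at least 4 KB, which is why the BUILD_BUG_ON shrinks to a single lower bound:

	/* bss = min(PAGE_SHIFT, 15) caps the RBR block size at 32 KB:
	 *   PAGE_SHIFT = 12 (4 KB):  block  4 KB, 1 block/page
	 *   PAGE_SHIFT = 14 (16 KB): block 16 KB, 1 block/page
	 *   PAGE_SHIFT = 16 (64 KB): block 32 KB, 2 blocks/page
	 */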
diff --git a/drivers/net/ppp_mppe.c b/drivers/net/ppp_mppe.c
index c0b6d19d145..bcb0885011c 100644
--- a/drivers/net/ppp_mppe.c
+++ b/drivers/net/ppp_mppe.c
@@ -55,7 +55,7 @@
#include <linux/mm.h>
#include <linux/ppp_defs.h>
#include <linux/ppp-comp.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
#include "ppp_mppe.h"
@@ -68,9 +68,7 @@ MODULE_VERSION("1.0.2");
static unsigned int
setup_sg(struct scatterlist *sg, const void *address, unsigned int length)
{
- sg[0].page = virt_to_page(address);
- sg[0].offset = offset_in_page(address);
- sg[0].length = length;
+ sg_init_one(sg, address, length);
return length;
}
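
sg_init_one() replaces the open-coded page/offset/length triple and also sets up the list markers that the chained-scatterlist code now requires. Roughly, for the single-entry case (assuming the 2.6.24-era helpers):

	sg_init_table(sg, 1);		/* zero the entry, mark it as last */
	sg_set_buf(sg, address, length);/* derive page + offset from the
					 * virtual address */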
diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 419c00cbe6e..e8960f294a6 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -44,7 +44,8 @@
printk( "Assertion failed! %s,%s,%s,line=%d\n", \
#expr,__FILE__,__FUNCTION__,__LINE__); \
}
-#define dprintk(fmt, args...) do { printk(PFX fmt, ## args); } while (0)
+#define dprintk(fmt, args...) \
+ do { printk(KERN_DEBUG PFX fmt, ## args); } while (0)
#else
#define assert(expr) do {} while (0)
#define dprintk(fmt, args...) do {} while (0)
@@ -111,19 +112,15 @@ enum mac_version {
RTL_GIGA_MAC_VER_05 = 0x05, // 8110SCd
RTL_GIGA_MAC_VER_06 = 0x06, // 8110SCe
RTL_GIGA_MAC_VER_11 = 0x0b, // 8168Bb
- RTL_GIGA_MAC_VER_12 = 0x0c, // 8168Be 8168Bf
- RTL_GIGA_MAC_VER_13 = 0x0d, // 8101Eb 8101Ec
- RTL_GIGA_MAC_VER_14 = 0x0e, // 8101
- RTL_GIGA_MAC_VER_15 = 0x0f // 8101
-};
-
-enum phy_version {
- RTL_GIGA_PHY_VER_C = 0x03, /* PHY Reg 0x03 bit0-3 == 0x0000 */
- RTL_GIGA_PHY_VER_D = 0x04, /* PHY Reg 0x03 bit0-3 == 0x0000 */
- RTL_GIGA_PHY_VER_E = 0x05, /* PHY Reg 0x03 bit0-3 == 0x0000 */
- RTL_GIGA_PHY_VER_F = 0x06, /* PHY Reg 0x03 bit0-3 == 0x0001 */
- RTL_GIGA_PHY_VER_G = 0x07, /* PHY Reg 0x03 bit0-3 == 0x0002 */
- RTL_GIGA_PHY_VER_H = 0x08, /* PHY Reg 0x03 bit0-3 == 0x0003 */
+ RTL_GIGA_MAC_VER_12 = 0x0c, // 8168Be
+ RTL_GIGA_MAC_VER_13 = 0x0d, // 8101Eb
+ RTL_GIGA_MAC_VER_14 = 0x0e, // 8101 ?
+ RTL_GIGA_MAC_VER_15 = 0x0f, // 8101 ?
+ RTL_GIGA_MAC_VER_16 = 0x11, // 8101Ec
+ RTL_GIGA_MAC_VER_17 = 0x10, // 8168Bf
+ RTL_GIGA_MAC_VER_18 = 0x12, // 8168CP
+ RTL_GIGA_MAC_VER_19 = 0x13, // 8168C
+ RTL_GIGA_MAC_VER_20 = 0x14 // 8168C
};
#define _R(NAME,MAC,MASK) \
@@ -144,7 +141,12 @@ static const struct {
_R("RTL8168b/8111b", RTL_GIGA_MAC_VER_12, 0xff7e1880), // PCI-E
_R("RTL8101e", RTL_GIGA_MAC_VER_13, 0xff7e1880), // PCI-E 8139
_R("RTL8100e", RTL_GIGA_MAC_VER_14, 0xff7e1880), // PCI-E 8139
- _R("RTL8100e", RTL_GIGA_MAC_VER_15, 0xff7e1880) // PCI-E 8139
+ _R("RTL8100e", RTL_GIGA_MAC_VER_15, 0xff7e1880), // PCI-E 8139
+ _R("RTL8168b/8111b", RTL_GIGA_MAC_VER_17, 0xff7e1880), // PCI-E
+ _R("RTL8101e", RTL_GIGA_MAC_VER_16, 0xff7e1880), // PCI-E
+ _R("RTL8168cp/8111cp", RTL_GIGA_MAC_VER_18, 0xff7e1880), // PCI-E
+ _R("RTL8168c/8111c", RTL_GIGA_MAC_VER_19, 0xff7e1880), // PCI-E
+ _R("RTL8168c/8111c", RTL_GIGA_MAC_VER_20, 0xff7e1880) // PCI-E
};
#undef _R
@@ -165,7 +167,7 @@ static struct pci_device_id rtl8169_pci_tbl[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_REALTEK, 0x8168), 0, 0, RTL_CFG_1 },
{ PCI_DEVICE(PCI_VENDOR_ID_REALTEK, 0x8169), 0, 0, RTL_CFG_0 },
{ PCI_DEVICE(PCI_VENDOR_ID_DLINK, 0x4300), 0, 0, RTL_CFG_0 },
- { PCI_DEVICE(0x1259, 0xc107), 0, 0, RTL_CFG_0 },
+ { PCI_DEVICE(PCI_VENDOR_ID_AT, 0xc107), 0, 0, RTL_CFG_0 },
{ PCI_DEVICE(0x16ec, 0x0116), 0, 0, RTL_CFG_0 },
{ PCI_VENDOR_ID_LINKSYS, 0x1032,
PCI_ANY_ID, 0x0024, 0, 0, RTL_CFG_0 },
@@ -277,6 +279,7 @@ enum rtl_register_content {
TxDMAShift = 8, /* DMA burst value (0-7) is shift this many bits */
/* Config1 register p.24 */
+ MSIEnable = (1 << 5), /* Enable Message Signaled Interrupt */
PMEnable = (1 << 0), /* Power Management Enable */
/* Config2 register p. 25 */
@@ -380,17 +383,20 @@ struct ring_info {
u8 __pad[sizeof(void *) - sizeof(u32)];
};
+enum features {
+ RTL_FEATURE_WOL = (1 << 0),
+ RTL_FEATURE_MSI = (1 << 1),
+};
+
struct rtl8169_private {
void __iomem *mmio_addr; /* memory map physical address */
struct pci_dev *pci_dev; /* Index of PCI device */
struct net_device *dev;
struct napi_struct napi;
- struct net_device_stats stats; /* statistics of net device */
spinlock_t lock; /* spin lock flag */
u32 msg_enable;
int chipset;
int mac_version;
- int phy_version;
u32 cur_rx; /* Index into the Rx descriptor buffer of next Rx pkt. */
u32 cur_tx; /* Index into the Tx descriptor buffer of next Rx pkt. */
u32 dirty_rx;
@@ -420,7 +426,7 @@ struct rtl8169_private {
unsigned int (*phy_reset_pending)(void __iomem *);
unsigned int (*link_ok)(void __iomem *);
struct delayed_work task;
- unsigned wol_enabled : 1;
+ unsigned features;
};
MODULE_AUTHOR("Realtek and the Linux r8169 crew <netdev@vger.kernel.org>");
@@ -626,7 +632,10 @@ static int rtl8169_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
RTL_W8(Cfg9346, Cfg9346_Lock);
- tp->wol_enabled = (wol->wolopts) ? 1 : 0;
+ if (wol->wolopts)
+ tp->features |= RTL_FEATURE_WOL;
+ else
+ tp->features &= ~RTL_FEATURE_WOL;
spin_unlock_irq(&tp->lock);
@@ -707,7 +716,8 @@ static int rtl8169_set_speed_xmii(struct net_device *dev,
/* This tweak comes straight from Realtek's driver. */
if ((speed == SPEED_100) && (duplex == DUPLEX_HALF) &&
- (tp->mac_version == RTL_GIGA_MAC_VER_13)) {
+ ((tp->mac_version == RTL_GIGA_MAC_VER_13) ||
+ (tp->mac_version == RTL_GIGA_MAC_VER_16))) {
auto_nego = ADVERTISE_100HALF | ADVERTISE_CSMA;
}
}
@@ -715,7 +725,8 @@ static int rtl8169_set_speed_xmii(struct net_device *dev,
/* The 8100e/8101e do Fast Ethernet only. */
if ((tp->mac_version == RTL_GIGA_MAC_VER_13) ||
(tp->mac_version == RTL_GIGA_MAC_VER_14) ||
- (tp->mac_version == RTL_GIGA_MAC_VER_15)) {
+ (tp->mac_version == RTL_GIGA_MAC_VER_15) ||
+ (tp->mac_version == RTL_GIGA_MAC_VER_16)) {
if ((giga_ctrl & (ADVERTISE_1000FULL | ADVERTISE_1000HALF)) &&
netif_msg_link(tp)) {
printk(KERN_INFO "%s: PHY does not support 1000Mbps.\n",
@@ -726,7 +737,8 @@ static int rtl8169_set_speed_xmii(struct net_device *dev,
auto_nego |= ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM;
- if (tp->mac_version == RTL_GIGA_MAC_VER_12) {
+ if ((tp->mac_version == RTL_GIGA_MAC_VER_12) ||
+ (tp->mac_version == RTL_GIGA_MAC_VER_17)) {
/* Vendor specific (0x1f) and reserved (0x0e) MII registers. */
mdio_write(ioaddr, 0x1f, 0x0000);
mdio_write(ioaddr, 0x0e, 0x0000);
@@ -1104,26 +1116,51 @@ static void rtl8169_get_mac_version(struct rtl8169_private *tp,
*/
const struct {
u32 mask;
+ u32 val;
int mac_version;
} mac_info[] = {
- { 0x38800000, RTL_GIGA_MAC_VER_15 },
- { 0x38000000, RTL_GIGA_MAC_VER_12 },
- { 0x34000000, RTL_GIGA_MAC_VER_13 },
- { 0x30800000, RTL_GIGA_MAC_VER_14 },
- { 0x30000000, RTL_GIGA_MAC_VER_11 },
- { 0x98000000, RTL_GIGA_MAC_VER_06 },
- { 0x18000000, RTL_GIGA_MAC_VER_05 },
- { 0x10000000, RTL_GIGA_MAC_VER_04 },
- { 0x04000000, RTL_GIGA_MAC_VER_03 },
- { 0x00800000, RTL_GIGA_MAC_VER_02 },
- { 0x00000000, RTL_GIGA_MAC_VER_01 } /* Catch-all */
+ /* 8168C family. */
+ { 0x7c800000, 0x3c800000, RTL_GIGA_MAC_VER_18 },
+ { 0x7cf00000, 0x3c000000, RTL_GIGA_MAC_VER_19 },
+ { 0x7cf00000, 0x3c200000, RTL_GIGA_MAC_VER_20 },
+ { 0x7c800000, 0x3c000000, RTL_GIGA_MAC_VER_20 },
+
+ /* 8168B family. */
+ { 0x7cf00000, 0x38000000, RTL_GIGA_MAC_VER_12 },
+ { 0x7cf00000, 0x38500000, RTL_GIGA_MAC_VER_17 },
+ { 0x7c800000, 0x38000000, RTL_GIGA_MAC_VER_17 },
+ { 0x7c800000, 0x30000000, RTL_GIGA_MAC_VER_11 },
+
+ /* 8101 family. */
+ { 0x7cf00000, 0x34000000, RTL_GIGA_MAC_VER_13 },
+ { 0x7cf00000, 0x34200000, RTL_GIGA_MAC_VER_16 },
+ { 0x7c800000, 0x34000000, RTL_GIGA_MAC_VER_16 },
+ /* FIXME: where did these entries come from ? -- FR */
+ { 0xfc800000, 0x38800000, RTL_GIGA_MAC_VER_15 },
+ { 0xfc800000, 0x30800000, RTL_GIGA_MAC_VER_14 },
+
+ /* 8110 family. */
+ { 0xfc800000, 0x98000000, RTL_GIGA_MAC_VER_06 },
+ { 0xfc800000, 0x18000000, RTL_GIGA_MAC_VER_05 },
+ { 0xfc800000, 0x10000000, RTL_GIGA_MAC_VER_04 },
+ { 0xfc800000, 0x04000000, RTL_GIGA_MAC_VER_03 },
+ { 0xfc800000, 0x00800000, RTL_GIGA_MAC_VER_02 },
+ { 0xfc800000, 0x00000000, RTL_GIGA_MAC_VER_01 },
+
+ { 0x00000000, 0x00000000, RTL_GIGA_MAC_VER_01 } /* Catch-all */
}, *p = mac_info;
u32 reg;
- reg = RTL_R32(TxConfig) & 0xfc800000;
- while ((reg & p->mask) != p->mask)
+ reg = RTL_R32(TxConfig);
+ while ((reg & p->mask) != p->val)
p++;
tp->mac_version = p->mac_version;
+
+ if (p->mask == 0x00000000) {
+ struct pci_dev *pdev = tp->pci_dev;
+
+ dev_info(&pdev->dev, "unknown MAC (%08x)\n", reg);
+ }
}
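
MAC version detection thus moves from a shared 0xfc800000 prefix mask to per-entry (mask, val) pairs, so families with different significant TxConfig bits can coexist in one table. A worked example of the walk, using the 8168C values above:

	/* reg = 0x3c200000 (TxConfig):
	 *   reg & 0x7c800000 = 0x3c000000 != 0x3c800000  -> skip VER_18
	 *   reg & 0x7cf00000 = 0x3c200000 != 0x3c000000  -> skip VER_19
	 *   reg & 0x7cf00000 = 0x3c200000 == 0x3c200000  -> VER_20
	 * The final all-zero mask matches any value, so the loop always
	 * terminates at the VER_01 catch-all. */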
static void rtl8169_print_mac_version(struct rtl8169_private *tp)
@@ -1131,54 +1168,21 @@ static void rtl8169_print_mac_version(struct rtl8169_private *tp)
dprintk("mac_version = 0x%02x\n", tp->mac_version);
}
-static void rtl8169_get_phy_version(struct rtl8169_private *tp,
- void __iomem *ioaddr)
-{
- const struct {
- u16 mask;
- u16 set;
- int phy_version;
- } phy_info[] = {
- { 0x000f, 0x0002, RTL_GIGA_PHY_VER_G },
- { 0x000f, 0x0001, RTL_GIGA_PHY_VER_F },
- { 0x000f, 0x0000, RTL_GIGA_PHY_VER_E },
- { 0x0000, 0x0000, RTL_GIGA_PHY_VER_D } /* Catch-all */
- }, *p = phy_info;
+struct phy_reg {
u16 reg;
+ u16 val;
+};
- reg = mdio_read(ioaddr, MII_PHYSID2) & 0xffff;
- while ((reg & p->mask) != p->set)
- p++;
- tp->phy_version = p->phy_version;
-}
-
-static void rtl8169_print_phy_version(struct rtl8169_private *tp)
+static void rtl_phy_write(void __iomem *ioaddr, struct phy_reg *regs, int len)
{
- struct {
- int version;
- char *msg;
- u32 reg;
- } phy_print[] = {
- { RTL_GIGA_PHY_VER_G, "RTL_GIGA_PHY_VER_G", 0x0002 },
- { RTL_GIGA_PHY_VER_F, "RTL_GIGA_PHY_VER_F", 0x0001 },
- { RTL_GIGA_PHY_VER_E, "RTL_GIGA_PHY_VER_E", 0x0000 },
- { RTL_GIGA_PHY_VER_D, "RTL_GIGA_PHY_VER_D", 0x0000 },
- { 0, NULL, 0x0000 }
- }, *p;
-
- for (p = phy_print; p->msg; p++) {
- if (tp->phy_version == p->version) {
- dprintk("phy_version == %s (%04x)\n", p->msg, p->reg);
- return;
- }
+ while (len-- > 0) {
+ mdio_write(ioaddr, regs->reg, regs->val);
+ regs++;
}
- dprintk("phy_version == Unknown\n");
}
-static void rtl8169_hw_phy_config(struct net_device *dev)
+static void rtl8169s_hw_phy_config(void __iomem *ioaddr)
{
- struct rtl8169_private *tp = netdev_priv(dev);
- void __iomem *ioaddr = tp->mmio_addr;
struct {
u16 regs[5]; /* Beware of bit-sign propagation */
} phy_magic[5] = { {
@@ -1211,33 +1215,9 @@ static void rtl8169_hw_phy_config(struct net_device *dev)
}, *p = phy_magic;
unsigned int i;
- rtl8169_print_mac_version(tp);
- rtl8169_print_phy_version(tp);
-
- if (tp->mac_version <= RTL_GIGA_MAC_VER_01)
- return;
- if (tp->phy_version >= RTL_GIGA_PHY_VER_H)
- return;
-
- dprintk("MAC version != 0 && PHY version == 0 or 1\n");
- dprintk("Do final_reg2.cfg\n");
-
- /* Shazam ! */
-
- if (tp->mac_version == RTL_GIGA_MAC_VER_04) {
- mdio_write(ioaddr, 31, 0x0002);
- mdio_write(ioaddr, 1, 0x90d0);
- mdio_write(ioaddr, 31, 0x0000);
- return;
- }
-
- if ((tp->mac_version != RTL_GIGA_MAC_VER_02) &&
- (tp->mac_version != RTL_GIGA_MAC_VER_03))
- return;
-
- mdio_write(ioaddr, 31, 0x0001); //w 31 2 0 1
- mdio_write(ioaddr, 21, 0x1000); //w 21 15 0 1000
- mdio_write(ioaddr, 24, 0x65c7); //w 24 15 0 65c7
+ mdio_write(ioaddr, 0x1f, 0x0001); //w 31 2 0 1
+ mdio_write(ioaddr, 0x15, 0x1000); //w 21 15 0 1000
+ mdio_write(ioaddr, 0x18, 0x65c7); //w 24 15 0 65c7
rtl8169_write_gmii_reg_bit(ioaddr, 4, 11, 0); //w 4 11 11 0
for (i = 0; i < ARRAY_SIZE(phy_magic); i++, p++) {
@@ -1250,7 +1230,115 @@ static void rtl8169_hw_phy_config(struct net_device *dev)
rtl8169_write_gmii_reg_bit(ioaddr, 4, 11, 1); //w 4 11 11 1
rtl8169_write_gmii_reg_bit(ioaddr, 4, 11, 0); //w 4 11 11 0
}
- mdio_write(ioaddr, 31, 0x0000); //w 31 2 0 0
+ mdio_write(ioaddr, 0x1f, 0x0000); //w 31 2 0 0
+}
+
+static void rtl8169sb_hw_phy_config(void __iomem *ioaddr)
+{
+ struct phy_reg phy_reg_init[] = {
+ { 0x1f, 0x0002 },
+ { 0x01, 0x90d0 },
+ { 0x1f, 0x0000 }
+ };
+
+ rtl_phy_write(ioaddr, phy_reg_init, ARRAY_SIZE(phy_reg_init));
+}
+static void rtl8168b_hw_phy_config(void __iomem *ioaddr)
+{
+ struct phy_reg phy_reg_init[] = {
+ { 0x1f, 0x0000 },
+ { 0x10, 0xf41b },
+ { 0x1f, 0x0000 }
+ };
+
+ rtl_phy_write(ioaddr, phy_reg_init, ARRAY_SIZE(phy_reg_init));
+}
+
+static void rtl8168cp_hw_phy_config(void __iomem *ioaddr)
+{
+ struct phy_reg phy_reg_init[] = {
+ { 0x1f, 0x0000 },
+ { 0x1d, 0x0f00 },
+ { 0x1f, 0x0002 },
+ { 0x0c, 0x1ec8 },
+ { 0x1f, 0x0000 }
+ };
+
+ rtl_phy_write(ioaddr, phy_reg_init, ARRAY_SIZE(phy_reg_init));
+}
+
+static void rtl8168c_hw_phy_config(void __iomem *ioaddr)
+{
+ struct phy_reg phy_reg_init[] = {
+ { 0x1f, 0x0001 },
+ { 0x12, 0x2300 },
+ { 0x1f, 0x0002 },
+ { 0x00, 0x88d4 },
+ { 0x01, 0x82b1 },
+ { 0x03, 0x7002 },
+ { 0x08, 0x9e30 },
+ { 0x09, 0x01f0 },
+ { 0x0a, 0x5500 },
+ { 0x0c, 0x00c8 },
+ { 0x1f, 0x0003 },
+ { 0x12, 0xc096 },
+ { 0x16, 0x000a },
+ { 0x1f, 0x0000 }
+ };
+
+ rtl_phy_write(ioaddr, phy_reg_init, ARRAY_SIZE(phy_reg_init));
+}
+
+static void rtl8168cx_hw_phy_config(void __iomem *ioaddr)
+{
+ struct phy_reg phy_reg_init[] = {
+ { 0x1f, 0x0000 },
+ { 0x12, 0x2300 },
+ { 0x1f, 0x0003 },
+ { 0x16, 0x0f0a },
+ { 0x1f, 0x0000 },
+ { 0x1f, 0x0002 },
+ { 0x0c, 0x7eb8 },
+ { 0x1f, 0x0000 }
+ };
+
+ rtl_phy_write(ioaddr, phy_reg_init, ARRAY_SIZE(phy_reg_init));
+}
+
+static void rtl_hw_phy_config(struct net_device *dev)
+{
+ struct rtl8169_private *tp = netdev_priv(dev);
+ void __iomem *ioaddr = tp->mmio_addr;
+
+ rtl8169_print_mac_version(tp);
+
+ switch (tp->mac_version) {
+ case RTL_GIGA_MAC_VER_01:
+ break;
+ case RTL_GIGA_MAC_VER_02:
+ case RTL_GIGA_MAC_VER_03:
+ rtl8169s_hw_phy_config(ioaddr);
+ break;
+ case RTL_GIGA_MAC_VER_04:
+ rtl8169sb_hw_phy_config(ioaddr);
+ break;
+ case RTL_GIGA_MAC_VER_11:
+ case RTL_GIGA_MAC_VER_12:
+ case RTL_GIGA_MAC_VER_17:
+ rtl8168b_hw_phy_config(ioaddr);
+ break;
+ case RTL_GIGA_MAC_VER_18:
+ rtl8168cp_hw_phy_config(ioaddr);
+ break;
+ case RTL_GIGA_MAC_VER_19:
+ rtl8168c_hw_phy_config(ioaddr);
+ break;
+ case RTL_GIGA_MAC_VER_20:
+ rtl8168cx_hw_phy_config(ioaddr);
+ break;
+ default:
+ break;
+ }
}
static void rtl8169_phy_timer(unsigned long __opaque)
@@ -1262,7 +1350,6 @@ static void rtl8169_phy_timer(unsigned long __opaque)
unsigned long timeout = RTL8169_PHY_TIMEOUT;
assert(tp->mac_version > RTL_GIGA_MAC_VER_01);
- assert(tp->phy_version < RTL_GIGA_PHY_VER_H);
if (!(tp->phy_1000_ctrl_reg & ADVERTISE_1000FULL))
return;
@@ -1297,8 +1384,7 @@ static inline void rtl8169_delete_timer(struct net_device *dev)
struct rtl8169_private *tp = netdev_priv(dev);
struct timer_list *timer = &tp->timer;
- if ((tp->mac_version <= RTL_GIGA_MAC_VER_01) ||
- (tp->phy_version >= RTL_GIGA_PHY_VER_H))
+ if (tp->mac_version <= RTL_GIGA_MAC_VER_01)
return;
del_timer_sync(timer);
@@ -1309,8 +1395,7 @@ static inline void rtl8169_request_timer(struct net_device *dev)
struct rtl8169_private *tp = netdev_priv(dev);
struct timer_list *timer = &tp->timer;
- if ((tp->mac_version <= RTL_GIGA_MAC_VER_01) ||
- (tp->phy_version >= RTL_GIGA_PHY_VER_H))
+ if (tp->mac_version <= RTL_GIGA_MAC_VER_01)
return;
mod_timer(timer, jiffies + RTL8169_PHY_TIMEOUT);
@@ -1362,7 +1447,7 @@ static void rtl8169_init_phy(struct net_device *dev, struct rtl8169_private *tp)
{
void __iomem *ioaddr = tp->mmio_addr;
- rtl8169_hw_phy_config(dev);
+ rtl_hw_phy_config(dev);
dprintk("Set MAC Reg C+CR Offset 0x82h = 0x01h\n");
RTL_W8(0x82, 0x01);
@@ -1457,6 +1542,7 @@ static const struct rtl_cfg_info {
unsigned int align;
u16 intr_event;
u16 napi_event;
+ unsigned msi;
} rtl_cfg_infos [] = {
[RTL_CFG_0] = {
.hw_start = rtl_hw_start_8169,
@@ -1464,7 +1550,8 @@ static const struct rtl_cfg_info {
.align = 0,
.intr_event = SYSErr | LinkChg | RxOverflow |
RxFIFOOver | TxErr | TxOK | RxOK | RxErr,
- .napi_event = RxFIFOOver | TxErr | TxOK | RxOK | RxOverflow
+ .napi_event = RxFIFOOver | TxErr | TxOK | RxOK | RxOverflow,
+ .msi = 0
},
[RTL_CFG_1] = {
.hw_start = rtl_hw_start_8168,
@@ -1472,7 +1559,8 @@ static const struct rtl_cfg_info {
.align = 8,
.intr_event = SYSErr | LinkChg | RxOverflow |
TxErr | TxOK | RxOK | RxErr,
- .napi_event = TxErr | TxOK | RxOK | RxOverflow
+ .napi_event = TxErr | TxOK | RxOK | RxOverflow,
+ .msi = RTL_FEATURE_MSI
},
[RTL_CFG_2] = {
.hw_start = rtl_hw_start_8101,
@@ -1480,10 +1568,39 @@ static const struct rtl_cfg_info {
.align = 8,
.intr_event = SYSErr | LinkChg | RxOverflow | PCSTimeout |
RxFIFOOver | TxErr | TxOK | RxOK | RxErr,
- .napi_event = RxFIFOOver | TxErr | TxOK | RxOK | RxOverflow
+ .napi_event = RxFIFOOver | TxErr | TxOK | RxOK | RxOverflow,
+ .msi = RTL_FEATURE_MSI
}
};
+/* Cfg9346_Unlock assumed. */
+static unsigned rtl_try_msi(struct pci_dev *pdev, void __iomem *ioaddr,
+ const struct rtl_cfg_info *cfg)
+{
+ unsigned msi = 0;
+ u8 cfg2;
+
+ cfg2 = RTL_R8(Config2) & ~MSIEnable;
+ if (cfg->msi) {
+ if (pci_enable_msi(pdev)) {
+ dev_info(&pdev->dev, "no MSI. Back to INTx.\n");
+ } else {
+ cfg2 |= MSIEnable;
+ msi = RTL_FEATURE_MSI;
+ }
+ }
+ RTL_W8(Config2, cfg2);
+ return msi;
+}
+
+static void rtl_disable_msi(struct pci_dev *pdev, struct rtl8169_private *tp)
+{
+ if (tp->features & RTL_FEATURE_MSI) {
+ pci_disable_msi(pdev);
+ tp->features &= ~RTL_FEATURE_MSI;
+ }
+}
+
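rtl_try_msi() and rtl_disable_msi() above capture a common probe-time
idiom: attempt pci_enable_msi(), fall back to legacy INTx on failure,
and record the outcome in a feature flag so request_irq() can drop
IRQF_SHARED (an MSI vector is never shared) and the error and remove
paths know to disable MSI again. A compile-only userspace sketch of
that control flow, with hypothetical stand-ins for the PCI calls:

    #include <stdio.h>

    #define FEATURE_MSI 0x1

    /* Stand-in returning 0 on success, like pci_enable_msi(). */
    static int fake_pci_enable_msi(void)
    {
        return -1;    /* pretend the platform has no MSI */
    }

    static void fake_pci_disable_msi(void)
    {
    }

    static unsigned try_msi(unsigned wanted)
    {
        if (!wanted)
            return 0;
        if (fake_pci_enable_msi()) {
            printf("no MSI, staying on INTx\n");
            return 0;
        }
        return FEATURE_MSI;
    }

    int main(void)
    {
        unsigned features = try_msi(FEATURE_MSI);

        /* request_irq() analogue: the shared flag is only needed
         * on the INTx path. */
        printf("irq flags: %s\n",
               (features & FEATURE_MSI) ? "0" : "IRQF_SHARED");

        if (features & FEATURE_MSI)
            fake_pci_disable_msi();    /* mirrored on remove/error */
        return 0;
    }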
static int __devinit
rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
{
@@ -1596,10 +1713,8 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
/* Identify chip attached to board */
rtl8169_get_mac_version(tp, ioaddr);
- rtl8169_get_phy_version(tp, ioaddr);
rtl8169_print_mac_version(tp);
- rtl8169_print_phy_version(tp);
for (i = ARRAY_SIZE(rtl_chip_info) - 1; i >= 0; i--) {
if (tp->mac_version == rtl_chip_info[i].mac_version)
@@ -1619,6 +1734,7 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
RTL_W8(Cfg9346, Cfg9346_Unlock);
RTL_W8(Config1, RTL_R8(Config1) | PMEnable);
RTL_W8(Config5, RTL_R8(Config5) & PMEStatus);
+ tp->features |= rtl_try_msi(pdev, ioaddr, cfg);
RTL_W8(Cfg9346, Cfg9346_Lock);
if (RTL_R8(PHYstatus) & TBI_Enable) {
@@ -1686,7 +1802,7 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
rc = register_netdev(dev);
if (rc < 0)
- goto err_out_unmap_5;
+ goto err_out_msi_5;
pci_set_drvdata(pdev, dev);
@@ -1709,7 +1825,8 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
out:
return rc;
-err_out_unmap_5:
+err_out_msi_5:
+ rtl_disable_msi(pdev, tp);
iounmap(ioaddr);
err_out_free_res_4:
pci_release_regions(pdev);
@@ -1730,6 +1847,7 @@ static void __devexit rtl8169_remove_one(struct pci_dev *pdev)
flush_scheduled_work();
unregister_netdev(dev);
+ rtl_disable_msi(pdev, tp);
rtl8169_release_board(pdev, dev, tp->mmio_addr);
pci_set_drvdata(pdev, NULL);
}
@@ -1773,7 +1891,8 @@ static int rtl8169_open(struct net_device *dev)
smp_mb();
- retval = request_irq(dev->irq, rtl8169_interrupt, IRQF_SHARED,
+ retval = request_irq(dev->irq, rtl8169_interrupt,
+ (tp->features & RTL_FEATURE_MSI) ? 0 : IRQF_SHARED,
dev->name, dev);
if (retval < 0)
goto err_release_ring_2;
@@ -1933,7 +2052,7 @@ static void rtl_hw_start_8169(struct net_device *dev)
if ((tp->mac_version == RTL_GIGA_MAC_VER_02) ||
(tp->mac_version == RTL_GIGA_MAC_VER_03)) {
- dprintk(KERN_INFO PFX "Set MAC Reg C+CR Offset 0xE0. "
+ dprintk("Set MAC Reg C+CR Offset 0xE0. "
"Bit-3 and bit-14 MUST be 1\n");
tp->cp_cmd |= (1 << 14);
}
@@ -2029,7 +2148,8 @@ static void rtl_hw_start_8101(struct net_device *dev)
void __iomem *ioaddr = tp->mmio_addr;
struct pci_dev *pdev = tp->pci_dev;
- if (tp->mac_version == RTL_GIGA_MAC_VER_13) {
+ if ((tp->mac_version == RTL_GIGA_MAC_VER_13) ||
+ (tp->mac_version == RTL_GIGA_MAC_VER_16)) {
pci_write_config_word(pdev, 0x68, 0x00);
pci_write_config_word(pdev, 0x69, 0x08);
}
@@ -2259,7 +2379,7 @@ static void rtl8169_tx_clear(struct rtl8169_private *tp)
dev_kfree_skb(skb);
tx_skb->skb = NULL;
}
- tp->stats.tx_dropped++;
+ tp->dev->stats.tx_dropped++;
}
}
tp->cur_tx = tp->dirty_tx = 0;
@@ -2310,7 +2430,7 @@ static void rtl8169_reinit_task(struct work_struct *work)
ret = rtl8169_open(dev);
if (unlikely(ret < 0)) {
if (net_ratelimit() && netif_msg_drv(tp)) {
- printk(PFX KERN_ERR "%s: reinit failure (status = %d)."
+ printk(KERN_ERR PFX "%s: reinit failure (status = %d)."
" Rescheduling.\n", dev->name, ret);
}
rtl8169_schedule_work(dev, rtl8169_reinit_task);
@@ -2340,9 +2460,10 @@ static void rtl8169_reset_task(struct work_struct *work)
rtl8169_init_ring_indexes(tp);
rtl_hw_start(dev);
netif_wake_queue(dev);
+ rtl8169_check_link_status(dev, tp, tp->mmio_addr);
} else {
if (net_ratelimit() && netif_msg_intr(tp)) {
- printk(PFX KERN_EMERG "%s: Rx buffers shortage\n",
+ printk(KERN_EMERG PFX "%s: Rx buffers shortage\n",
dev->name);
}
rtl8169_schedule_work(dev, rtl8169_reset_task);
@@ -2496,7 +2617,7 @@ err_stop:
netif_stop_queue(dev);
ret = NETDEV_TX_BUSY;
err_update_stats:
- tp->stats.tx_dropped++;
+ dev->stats.tx_dropped++;
goto out;
}
@@ -2571,8 +2692,8 @@ static void rtl8169_tx_interrupt(struct net_device *dev,
if (status & DescOwn)
break;
- tp->stats.tx_bytes += len;
- tp->stats.tx_packets++;
+ dev->stats.tx_bytes += len;
+ dev->stats.tx_packets++;
rtl8169_unmap_tx_skb(tp->pci_dev, tx_skb, tp->TxDescArray + entry);
@@ -2672,14 +2793,14 @@ static int rtl8169_rx_interrupt(struct net_device *dev,
"%s: Rx ERROR. status = %08x\n",
dev->name, status);
}
- tp->stats.rx_errors++;
+ dev->stats.rx_errors++;
if (status & (RxRWT | RxRUNT))
- tp->stats.rx_length_errors++;
+ dev->stats.rx_length_errors++;
if (status & RxCRC)
- tp->stats.rx_crc_errors++;
+ dev->stats.rx_crc_errors++;
if (status & RxFOVF) {
rtl8169_schedule_work(dev, rtl8169_reset_task);
- tp->stats.rx_fifo_errors++;
+ dev->stats.rx_fifo_errors++;
}
rtl8169_mark_to_asic(desc, tp->rx_buf_sz);
} else {
@@ -2694,8 +2815,8 @@ static int rtl8169_rx_interrupt(struct net_device *dev,
* sized frames.
*/
if (unlikely(rtl8169_fragmented_frame(status))) {
- tp->stats.rx_dropped++;
- tp->stats.rx_length_errors++;
+ dev->stats.rx_dropped++;
+ dev->stats.rx_length_errors++;
rtl8169_mark_to_asic(desc, tp->rx_buf_sz);
continue;
}
@@ -2719,8 +2840,8 @@ static int rtl8169_rx_interrupt(struct net_device *dev,
rtl8169_rx_skb(skb);
dev->last_rx = jiffies;
- tp->stats.rx_bytes += pkt_size;
- tp->stats.rx_packets++;
+ dev->stats.rx_bytes += pkt_size;
+ dev->stats.rx_packets++;
}
/* Workaround for AMD platforms. */
@@ -2881,7 +3002,7 @@ core_down:
rtl8169_asic_down(ioaddr);
/* Update the error counts. */
- tp->stats.rx_missed_errors += RTL_R32(RxMissed);
+ dev->stats.rx_missed_errors += RTL_R32(RxMissed);
RTL_W32(RxMissed, 0);
spin_unlock_irq(&tp->lock);
@@ -2984,7 +3105,9 @@ static void rtl_set_rx_mode(struct net_device *dev)
(tp->mac_version == RTL_GIGA_MAC_VER_12) ||
(tp->mac_version == RTL_GIGA_MAC_VER_13) ||
(tp->mac_version == RTL_GIGA_MAC_VER_14) ||
- (tp->mac_version == RTL_GIGA_MAC_VER_15)) {
+ (tp->mac_version == RTL_GIGA_MAC_VER_15) ||
+ (tp->mac_version == RTL_GIGA_MAC_VER_16) ||
+ (tp->mac_version == RTL_GIGA_MAC_VER_17)) {
mc_filter[0] = 0xffffffff;
mc_filter[1] = 0xffffffff;
}
@@ -3011,12 +3134,12 @@ static struct net_device_stats *rtl8169_get_stats(struct net_device *dev)
if (netif_running(dev)) {
spin_lock_irqsave(&tp->lock, flags);
- tp->stats.rx_missed_errors += RTL_R32(RxMissed);
+ dev->stats.rx_missed_errors += RTL_R32(RxMissed);
RTL_W32(RxMissed, 0);
spin_unlock_irqrestore(&tp->lock, flags);
}
- return &tp->stats;
+ return &dev->stats;
}
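
The tp->stats to dev->stats substitutions running through this patch
retire the driver-private copy of the counters in favor of the ones
embedded in struct net_device, so get_stats() reduces to returning
&dev->stats. A toy sketch of the resulting shape (the structs here
are miniatures, not the kernel's definitions):

    #include <stdio.h>

    struct net_device_stats {
        unsigned long tx_packets, tx_dropped;
    };

    struct net_device {
        struct net_device_stats stats;    /* owned by the core */
    };

    /* No private counter copy left: just hand back the core's. */
    static struct net_device_stats *get_stats(struct net_device *dev)
    {
        return &dev->stats;
    }

    int main(void)
    {
        struct net_device dev = { { 0, 0 } };

        dev.stats.tx_packets++;    /* hot paths bump dev->stats */
        printf("tx_packets=%lu\n", get_stats(&dev)->tx_packets);
        return 0;
    }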
#ifdef CONFIG_PM
@@ -3037,14 +3160,15 @@ static int rtl8169_suspend(struct pci_dev *pdev, pm_message_t state)
rtl8169_asic_down(ioaddr);
- tp->stats.rx_missed_errors += RTL_R32(RxMissed);
+ dev->stats.rx_missed_errors += RTL_R32(RxMissed);
RTL_W32(RxMissed, 0);
spin_unlock_irq(&tp->lock);
out_pci_suspend:
pci_save_state(pdev);
- pci_enable_wake(pdev, pci_choose_state(pdev, state), tp->wol_enabled);
+ pci_enable_wake(pdev, pci_choose_state(pdev, state),
+ (tp->features & RTL_FEATURE_WOL) ? 1 : 0);
pci_set_power_state(pdev, pci_choose_state(pdev, state));
return 0;
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 014dc2cfe4d..09440d783e6 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -64,8 +64,8 @@
#define DRV_MODULE_NAME "tg3"
#define PFX DRV_MODULE_NAME ": "
-#define DRV_MODULE_VERSION "3.84"
-#define DRV_MODULE_RELDATE "October 12, 2007"
+#define DRV_MODULE_VERSION "3.85"
+#define DRV_MODULE_RELDATE "October 18, 2007"
#define TG3_DEF_MAC_MODE 0
#define TG3_DEF_RX_MODE 0
@@ -200,6 +200,7 @@ static struct pci_device_id tg3_pci_tbl[] = {
{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5906M)},
{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5784)},
{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5764)},
+ {PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5723)},
{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5761)},
{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5761E)},
{PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, PCI_DEVICE_ID_SYSKONNECT_9DXX)},
@@ -5028,10 +5029,7 @@ static int tg3_poll_fw(struct tg3 *tp)
/* Save PCI command register before chip reset */
static void tg3_save_pci_state(struct tg3 *tp)
{
- u32 val;
-
- pci_read_config_dword(tp->pdev, TG3PCI_COMMAND, &val);
- tp->pci_cmd = val;
+ pci_read_config_word(tp->pdev, PCI_COMMAND, &tp->pci_cmd);
}
/* Restore PCI state after chip reset */
@@ -5054,7 +5052,7 @@ static void tg3_restore_pci_state(struct tg3 *tp)
PCISTATE_ALLOW_APE_SHMEM_WR;
pci_write_config_dword(tp->pdev, TG3PCI_PCISTATE, val);
- pci_write_config_dword(tp->pdev, TG3PCI_COMMAND, tp->pci_cmd);
+ pci_write_config_word(tp->pdev, PCI_COMMAND, tp->pci_cmd);
if (!(tp->tg3_flags2 & TG3_FLG2_PCI_EXPRESS)) {
pci_write_config_byte(tp->pdev, PCI_CACHE_LINE_SIZE,
@@ -10820,9 +10818,24 @@ out_not_found:
strcpy(tp->board_part_number, "none");
}
+static int __devinit tg3_fw_img_is_valid(struct tg3 *tp, u32 offset)
+{
+ u32 val;
+
+ if (tg3_nvram_read_swab(tp, offset, &val) ||
+ (val & 0xfc000000) != 0x0c000000 ||
+ tg3_nvram_read_swab(tp, offset + 4, &val) ||
+ val != 0)
+ return 0;
+
+ return 1;
+}
+
static void __devinit tg3_read_fw_ver(struct tg3 *tp)
{
u32 val, offset, start;
+ u32 ver_offset;
+ int i, bcnt;
if (tg3_nvram_read_swab(tp, 0, &val))
return;
@@ -10835,29 +10848,71 @@ static void __devinit tg3_read_fw_ver(struct tg3 *tp)
return;
offset = tg3_nvram_logical_addr(tp, offset);
- if (tg3_nvram_read_swab(tp, offset, &val))
+
+ if (!tg3_fw_img_is_valid(tp, offset) ||
+ tg3_nvram_read_swab(tp, offset + 8, &ver_offset))
return;
- if ((val & 0xfc000000) == 0x0c000000) {
- u32 ver_offset, addr;
- int i;
+ offset = offset + ver_offset - start;
+ for (i = 0; i < 16; i += 4) {
+ if (tg3_nvram_read(tp, offset + i, &val))
+ return;
- if (tg3_nvram_read_swab(tp, offset + 4, &val) ||
- tg3_nvram_read_swab(tp, offset + 8, &ver_offset))
+ val = le32_to_cpu(val);
+ memcpy(tp->fw_ver + i, &val, 4);
+ }
+
+ if (!(tp->tg3_flags & TG3_FLAG_ENABLE_ASF) ||
+ (tp->tg3_flags & TG3_FLG3_ENABLE_APE))
+ return;
+
+ for (offset = TG3_NVM_DIR_START;
+ offset < TG3_NVM_DIR_END;
+ offset += TG3_NVM_DIRENT_SIZE) {
+ if (tg3_nvram_read_swab(tp, offset, &val))
return;
- if (val != 0)
+ if ((val >> TG3_NVM_DIRTYPE_SHIFT) == TG3_NVM_DIRTYPE_ASFINI)
+ break;
+ }
+
+ if (offset == TG3_NVM_DIR_END)
+ return;
+
+ if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS))
+ start = 0x08000000;
+ else if (tg3_nvram_read_swab(tp, offset - 4, &start))
+ return;
+
+ if (tg3_nvram_read_swab(tp, offset + 4, &offset) ||
+ !tg3_fw_img_is_valid(tp, offset) ||
+ tg3_nvram_read_swab(tp, offset + 8, &val))
+ return;
+
+ offset += val - start;
+
+ bcnt = strlen(tp->fw_ver);
+
+ tp->fw_ver[bcnt++] = ',';
+ tp->fw_ver[bcnt++] = ' ';
+
+ for (i = 0; i < 4; i++) {
+ if (tg3_nvram_read(tp, offset, &val))
return;
- addr = offset + ver_offset - start;
- for (i = 0; i < 16; i += 4) {
- if (tg3_nvram_read(tp, addr + i, &val))
- return;
+ val = le32_to_cpu(val);
+ offset += sizeof(val);
- val = cpu_to_le32(val);
- memcpy(tp->fw_ver + i, &val, 4);
+ if (bcnt > TG3_VER_SIZE - sizeof(val)) {
+ memcpy(&tp->fw_ver[bcnt], &val, TG3_VER_SIZE - bcnt);
+ break;
}
+
+ memcpy(&tp->fw_ver[bcnt], &val, sizeof(val));
+ bcnt += sizeof(val);
}
+
+ tp->fw_ver[TG3_VER_SIZE - 1] = 0;
}
static struct pci_dev * __devinit tg3_find_peer(struct tg3 *);
diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h
index 6dbdad2b8f8..1d5b2a3dd29 100644
--- a/drivers/net/tg3.h
+++ b/drivers/net/tg3.h
@@ -1540,6 +1540,12 @@
#define TG3_EEPROM_MAGIC_HW 0xabcd
#define TG3_EEPROM_MAGIC_HW_MSK 0xffff
+#define TG3_NVM_DIR_START 0x18
+#define TG3_NVM_DIR_END 0x78
+#define TG3_NVM_DIRENT_SIZE 0xc
+#define TG3_NVM_DIRTYPE_SHIFT 24
+#define TG3_NVM_DIRTYPE_ASFINI 1
+
/* 32K Window into NIC internal memory */
#define NIC_SRAM_WIN_BASE 0x00008000
@@ -2415,10 +2421,11 @@ struct tg3 {
#define PHY_REV_BCM5411_X0 0x1 /* Found on Netgear GA302T */
u32 led_ctrl;
- u32 pci_cmd;
+ u16 pci_cmd;
char board_part_number[24];
- char fw_ver[16];
+#define TG3_VER_SIZE 32
+ char fw_ver[TG3_VER_SIZE];
u32 nic_sram_data_cfg;
u32 pci_clock_ctrl;
struct pci_dev *pdev_peer;
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
new file mode 100644
index 00000000000..e396c9d2af8
--- /dev/null
+++ b/drivers/net/virtio_net.c
@@ -0,0 +1,435 @@
+/* A simple network driver using virtio.
+ *
+ * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+//#define DEBUG
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/module.h>
+#include <linux/virtio.h>
+#include <linux/virtio_net.h>
+#include <linux/scatterlist.h>
+
+/* FIXME: MTU in config. */
+#define MAX_PACKET_LEN (ETH_HLEN+ETH_DATA_LEN)
+
+struct virtnet_info
+{
+ struct virtio_device *vdev;
+ struct virtqueue *rvq, *svq;
+ struct net_device *dev;
+ struct napi_struct napi;
+
+ /* Number of input buffers, and max we've ever had. */
+ unsigned int num, max;
+
+ /* Receive & send queues. */
+ struct sk_buff_head recv;
+ struct sk_buff_head send;
+};
+
+static inline struct virtio_net_hdr *skb_vnet_hdr(struct sk_buff *skb)
+{
+ return (struct virtio_net_hdr *)skb->cb;
+}
+
+static inline void vnet_hdr_to_sg(struct scatterlist *sg, struct sk_buff *skb)
+{
+ sg_init_one(sg, skb_vnet_hdr(skb), sizeof(struct virtio_net_hdr));
+}
+
+static bool skb_xmit_done(struct virtqueue *rvq)
+{
+ struct virtnet_info *vi = rvq->vdev->priv;
+
+ /* In case we were waiting for output buffers. */
+ netif_wake_queue(vi->dev);
+ return true;
+}
+
+static void receive_skb(struct net_device *dev, struct sk_buff *skb,
+ unsigned len)
+{
+ struct virtio_net_hdr *hdr = skb_vnet_hdr(skb);
+
+ if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
+ pr_debug("%s: short packet %i\n", dev->name, len);
+ dev->stats.rx_length_errors++;
+ goto drop;
+ }
+ len -= sizeof(struct virtio_net_hdr);
+ BUG_ON(len > MAX_PACKET_LEN);
+
+ skb_trim(skb, len);
+ skb->protocol = eth_type_trans(skb, dev);
+ pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
+ ntohs(skb->protocol), skb->len, skb->pkt_type);
+ dev->stats.rx_bytes += skb->len;
+ dev->stats.rx_packets++;
+
+ if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+ pr_debug("Needs csum!\n");
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = hdr->csum_start;
+ skb->csum_offset = hdr->csum_offset;
+ if (skb->csum_start > skb->len - 2
+ || skb->csum_offset > skb->len - 2) {
+ if (net_ratelimit())
+ printk(KERN_WARNING "%s: csum=%u/%u len=%u\n",
+ dev->name, skb->csum_start,
+ skb->csum_offset, skb->len);
+ goto frame_err;
+ }
+ }
+
+ if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+ pr_debug("GSO!\n");
+ switch (hdr->gso_type) {
+ case VIRTIO_NET_HDR_GSO_TCPV4:
+ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+ break;
+ case VIRTIO_NET_HDR_GSO_TCPV4_ECN:
+ skb_shinfo(skb)->gso_type = SKB_GSO_TCP_ECN;
+ break;
+ case VIRTIO_NET_HDR_GSO_UDP:
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+ break;
+ case VIRTIO_NET_HDR_GSO_TCPV6:
+ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
+ break;
+ default:
+ if (net_ratelimit())
+ printk(KERN_WARNING "%s: bad gso type %u.\n",
+ dev->name, hdr->gso_type);
+ goto frame_err;
+ }
+
+ skb_shinfo(skb)->gso_size = hdr->gso_size;
+ if (skb_shinfo(skb)->gso_size == 0) {
+ if (net_ratelimit())
+ printk(KERN_WARNING "%s: zero gso size.\n",
+ dev->name);
+ goto frame_err;
+ }
+
+ /* Header must be checked, and gso_segs computed. */
+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+ skb_shinfo(skb)->gso_segs = 0;
+ }
+
+ netif_receive_skb(skb);
+ return;
+
+frame_err:
+ dev->stats.rx_frame_errors++;
+drop:
+ dev_kfree_skb(skb);
+}
+
+static void try_fill_recv(struct virtnet_info *vi)
+{
+ struct sk_buff *skb;
+ struct scatterlist sg[1+MAX_SKB_FRAGS];
+ int num, err;
+
+ for (;;) {
+ skb = netdev_alloc_skb(vi->dev, MAX_PACKET_LEN);
+ if (unlikely(!skb))
+ break;
+
+ skb_put(skb, MAX_PACKET_LEN);
+ vnet_hdr_to_sg(sg, skb);
+ num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
+ skb_queue_head(&vi->recv, skb);
+
+ err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, num, skb);
+ if (err) {
+ skb_unlink(skb, &vi->recv);
+ kfree_skb(skb);
+ break;
+ }
+ vi->num++;
+ }
+ if (unlikely(vi->num > vi->max))
+ vi->max = vi->num;
+ vi->rvq->vq_ops->kick(vi->rvq);
+}
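
try_fill_recv() keeps posting receive buffers until either allocation
fails or add_buf() reports the virtqueue is full, and tracks the
buffer count plus its high-water mark so the poll loop can refill
once the ring drops below half. A schematic sketch of the loop, with
a hypothetical fixed-size ring in place of the virtqueue:

    #include <stdio.h>
    #include <stdlib.h>

    #define RING_SIZE 8

    static int posted;

    /* Fails once the ring is full, the way add_buf() does when the
     * virtqueue runs out of free descriptors. */
    static int post(void *buf)
    {
        (void)buf;    /* a real ring would store it */
        if (posted >= RING_SIZE)
            return -1;
        posted++;
        return 0;
    }

    int main(void)
    {
        unsigned num = 0, max = 0;

        for (;;) {
            void *buf = malloc(64);
            if (!buf)
                break;
            if (post(buf)) {    /* ring full: undo and stop */
                free(buf);
                break;
            }
            num++;    /* buffers stay posted; the device hands
                       * them back through the completion path */
        }
        if (num > max)
            max = num;
        /* the kick() notifying the other side would go here */
        printf("posted %u buffers (high-water mark %u)\n", num, max);
        return 0;
    }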
+
+static bool skb_recv_done(struct virtqueue *rvq)
+{
+ struct virtnet_info *vi = rvq->vdev->priv;
+ netif_rx_schedule(vi->dev, &vi->napi);
+ /* Suppress further interrupts. */
+ return false;
+}
+
+static int virtnet_poll(struct napi_struct *napi, int budget)
+{
+ struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi);
+ struct sk_buff *skb = NULL;
+ unsigned int len, received = 0;
+
+again:
+ while (received < budget &&
+ (skb = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
+ __skb_unlink(skb, &vi->recv);
+ receive_skb(vi->dev, skb, len);
+ vi->num--;
+ received++;
+ }
+
+ /* FIXME: If we oom and completely run out of inbufs, we need
+ * to start a timer trying to fill more. */
+ if (vi->num < vi->max / 2)
+ try_fill_recv(vi);
+
+ /* All done? */
+ if (!skb) {
+ netif_rx_complete(vi->dev, napi);
+ if (unlikely(!vi->rvq->vq_ops->restart(vi->rvq))
+ && netif_rx_reschedule(vi->dev, napi))
+ goto again;
+ }
+
+ return received;
+}
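
virtnet_poll() follows the canonical NAPI shape: reap completed
buffers up to the budget, refill when the ring runs low, and only
once the queue is empty complete polling and re-enable notifications,
re-polling if new work slipped in during that race window. A
schematic sketch of the loop, with hypothetical queue helpers:

    #include <stdbool.h>
    #include <stdio.h>

    static int pending = 5;    /* completed buffers waiting */

    static bool get_buf(void)
    {
        if (!pending)
            return false;
        pending--;
        return true;
    }

    /* Re-enables notifications; returns false if work raced in,
     * which is exactly the case virtnet_poll() re-checks. */
    static bool restart(void)
    {
        return pending == 0;
    }

    static int poll(int budget)
    {
        int received = 0;
        bool got = false;

    again:
        while (received < budget && (got = get_buf()))
            received++;

        if (!got) {    /* queue ran dry: try to complete */
            if (!restart())
                goto again;    /* lost the race: keep polling */
        }
        return received;
    }

    int main(void)
    {
        printf("processed %d buffers\n", poll(16));
        return 0;
    }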
+
+static void free_old_xmit_skbs(struct virtnet_info *vi)
+{
+ struct sk_buff *skb;
+ unsigned int len;
+
+ while ((skb = vi->svq->vq_ops->get_buf(vi->svq, &len)) != NULL) {
+ pr_debug("Sent skb %p\n", skb);
+ __skb_unlink(skb, &vi->send);
+ vi->dev->stats.tx_bytes += len;
+ vi->dev->stats.tx_packets++;
+ kfree_skb(skb);
+ }
+}
+
+static int start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct virtnet_info *vi = netdev_priv(dev);
+ int num, err;
+ struct scatterlist sg[1+MAX_SKB_FRAGS];
+ struct virtio_net_hdr *hdr;
+ const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
+ DECLARE_MAC_BUF(mac);
+
+ pr_debug("%s: xmit %p %s\n", dev->name, skb, print_mac(mac, dest));
+
+ free_old_xmit_skbs(vi);
+
+ /* Encode metadata header at front. */
+ hdr = skb_vnet_hdr(skb);
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+ hdr->csum_start = skb->csum_start - skb_headroom(skb);
+ hdr->csum_offset = skb->csum_offset;
+ } else {
+ hdr->flags = 0;
+ hdr->csum_offset = hdr->csum_start = 0;
+ }
+
+ if (skb_is_gso(skb)) {
+ hdr->gso_size = skb_shinfo(skb)->gso_size;
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN)
+ hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4_ECN;
+ else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
+ hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+ else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
+ hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+ else if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
+ hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
+ else
+ BUG();
+ } else {
+ hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
+ hdr->gso_size = 0;
+ }
+
+ vnet_hdr_to_sg(sg, skb);
+ num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
+ __skb_queue_head(&vi->send, skb);
+ err = vi->svq->vq_ops->add_buf(vi->svq, sg, num, 0, skb);
+ if (err) {
+ pr_debug("%s: virtio not prepared to send\n", dev->name);
+ skb_unlink(skb, &vi->send);
+ netif_stop_queue(dev);
+ return NETDEV_TX_BUSY;
+ }
+ vi->svq->vq_ops->kick(vi->svq);
+
+ return 0;
+}
+
+static int virtnet_open(struct net_device *dev)
+{
+ struct virtnet_info *vi = netdev_priv(dev);
+
+ try_fill_recv(vi);
+
+ /* If we didn't even get one input buffer, we're useless. */
+ if (vi->num == 0)
+ return -ENOMEM;
+
+ napi_enable(&vi->napi);
+ return 0;
+}
+
+static int virtnet_close(struct net_device *dev)
+{
+ struct virtnet_info *vi = netdev_priv(dev);
+ struct sk_buff *skb;
+
+ napi_disable(&vi->napi);
+
+ /* networking core has neutered skb_xmit_done/skb_recv_done, so don't
+ * worry about races vs. get(). */
+ vi->rvq->vq_ops->shutdown(vi->rvq);
+ while ((skb = __skb_dequeue(&vi->recv)) != NULL) {
+ kfree_skb(skb);
+ vi->num--;
+ }
+ vi->svq->vq_ops->shutdown(vi->svq);
+ while ((skb = __skb_dequeue(&vi->send)) != NULL)
+ kfree_skb(skb);
+
+ BUG_ON(vi->num != 0);
+ return 0;
+}
+
+static int virtnet_probe(struct virtio_device *vdev)
+{
+ int err;
+ unsigned int len;
+ struct net_device *dev;
+ struct virtnet_info *vi;
+ void *token;
+
+ /* Allocate ourselves a network device with room for our info */
+ dev = alloc_etherdev(sizeof(struct virtnet_info));
+ if (!dev)
+ return -ENOMEM;
+
+ /* Set up network device as normal. */
+ ether_setup(dev);
+ dev->open = virtnet_open;
+ dev->stop = virtnet_close;
+ dev->hard_start_xmit = start_xmit;
+ dev->features = NETIF_F_HIGHDMA;
+ SET_NETDEV_DEV(dev, &vdev->dev);
+
+ /* Do we support "hardware" checksums? */
+ token = vdev->config->find(vdev, VIRTIO_CONFIG_NET_F, &len);
+ if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_NO_CSUM)) {
+ /* This opens up the world of extra features. */
+ dev->features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST;
+ if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO4))
+ dev->features |= NETIF_F_TSO;
+ if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_UFO))
+ dev->features |= NETIF_F_UFO;
+ if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO4_ECN))
+ dev->features |= NETIF_F_TSO_ECN;
+ if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO6))
+ dev->features |= NETIF_F_TSO6;
+ }
+
+ /* Configuration may specify what MAC to use. Otherwise random. */
+ token = vdev->config->find(vdev, VIRTIO_CONFIG_NET_MAC_F, &len);
+ if (token) {
+ dev->addr_len = len;
+ vdev->config->get(vdev, token, dev->dev_addr, len);
+ } else
+ random_ether_addr(dev->dev_addr);
+
+ /* Set up our device-specific information */
+ vi = netdev_priv(dev);
+ netif_napi_add(dev, &vi->napi, virtnet_poll, 16);
+ vi->dev = dev;
+ vi->vdev = vdev;
+
+ /* We expect two virtqueues, receive then send. */
+ vi->rvq = vdev->config->find_vq(vdev, skb_recv_done);
+ if (IS_ERR(vi->rvq)) {
+ err = PTR_ERR(vi->rvq);
+ goto free;
+ }
+
+ vi->svq = vdev->config->find_vq(vdev, skb_xmit_done);
+ if (IS_ERR(vi->svq)) {
+ err = PTR_ERR(vi->svq);
+ goto free_recv;
+ }
+
+ /* Initialize our empty receive and send queues. */
+ skb_queue_head_init(&vi->recv);
+ skb_queue_head_init(&vi->send);
+
+ err = register_netdev(dev);
+ if (err) {
+ pr_debug("virtio_net: registering device failed\n");
+ goto free_send;
+ }
+ pr_debug("virtnet: registered device %s\n", dev->name);
+ vdev->priv = vi;
+ return 0;
+
+free_send:
+ vdev->config->del_vq(vi->svq);
+free_recv:
+ vdev->config->del_vq(vi->rvq);
+free:
+ free_netdev(dev);
+ return err;
+}
+
+static void virtnet_remove(struct virtio_device *vdev)
+{
+ unregister_netdev(vdev->priv);
+ free_netdev(vdev->priv);
+}
+
+static struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
+ { 0 },
+};
+
+static struct virtio_driver virtio_net = {
+ .driver.name = KBUILD_MODNAME,
+ .driver.owner = THIS_MODULE,
+ .id_table = id_table,
+ .probe = virtnet_probe,
+ .remove = __devexit_p(virtnet_remove),
+};
+
+static int __init init(void)
+{
+ return register_virtio_driver(&virtio_net);
+}
+
+static void __exit fini(void)
+{
+ unregister_virtio_driver(&virtio_net);
+}
+module_init(init);
+module_exit(fini);
+
+MODULE_DEVICE_TABLE(virtio, id_table);
+MODULE_DESCRIPTION("Virtio network driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c
index b3c4dbff26b..7c60cbd85dc 100644
--- a/drivers/parisc/ccio-dma.c
+++ b/drivers/parisc/ccio-dma.c
@@ -42,6 +42,7 @@
#include <linux/reboot.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/scatterlist.h>
#include <asm/byteorder.h>
#include <asm/cache.h> /* for L1_CACHE_BYTES */
diff --git a/drivers/parisc/lba_pci.c b/drivers/parisc/lba_pci.c
index 5b86ee5c1ee..5eace9e66e1 100644
--- a/drivers/parisc/lba_pci.c
+++ b/drivers/parisc/lba_pci.c
@@ -557,44 +557,6 @@ lba_bios_init(void)
#ifdef CONFIG_64BIT
/*
-** Determine if a device is already configured.
-** If so, reserve it resources.
-**
-** Read PCI cfg command register and see if I/O or MMIO is enabled.
-** PAT has to enable the devices it's using.
-**
-** Note: resources are fixed up before we try to claim them.
-*/
-static void
-lba_claim_dev_resources(struct pci_dev *dev)
-{
- u16 cmd;
- int i, srch_flags;
-
- (void) pci_read_config_word(dev, PCI_COMMAND, &cmd);
-
- srch_flags = (cmd & PCI_COMMAND_IO) ? IORESOURCE_IO : 0;
- if (cmd & PCI_COMMAND_MEMORY)
- srch_flags |= IORESOURCE_MEM;
-
- if (!srch_flags)
- return;
-
- for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
- if (dev->resource[i].flags & srch_flags) {
- pci_claim_resource(dev, i);
- DBG(" claimed %s %d [%lx,%lx]/%lx\n",
- pci_name(dev), i,
- dev->resource[i].start,
- dev->resource[i].end,
- dev->resource[i].flags
- );
- }
- }
-}
-
-
-/*
* truncate_pat_collision: Deal with overlaps or outright collisions
* between PAT PDC reported ranges.
*
@@ -653,7 +615,6 @@ truncate_pat_collision(struct resource *root, struct resource *new)
}
#else
-#define lba_claim_dev_resources(dev) do { } while (0)
#define truncate_pat_collision(r,n) (0)
#endif
@@ -684,8 +645,12 @@ lba_fixup_bus(struct pci_bus *bus)
** pci_alloc_primary_bus() mangles this.
*/
if (bus->self) {
+ int i;
/* PCI-PCI Bridge */
pci_read_bridge_bases(bus);
+ for (i = PCI_BRIDGE_RESOURCES; i < PCI_NUM_RESOURCES; i++) {
+ pci_claim_resource(bus->self, i);
+ }
} else {
/* Host-PCI Bridge */
int err, i;
@@ -803,6 +768,9 @@ lba_fixup_bus(struct pci_bus *bus)
DBG("lba_fixup_bus() WTF? 0x%lx [%lx/%lx] XXX",
res->flags, res->start, res->end);
}
+ if ((i != PCI_ROM_RESOURCE) ||
+ (res->flags & IORESOURCE_ROM_ENABLE))
+ pci_claim_resource(dev, i);
}
#ifdef FBB_SUPPORT
@@ -814,11 +782,6 @@ lba_fixup_bus(struct pci_bus *bus)
bus->bridge_ctl &= ~(status & PCI_STATUS_FAST_BACK);
#endif
- if (is_pdc_pat()) {
- /* Claim resources for PDC's devices */
- lba_claim_dev_resources(dev);
- }
-
/*
** P2PB's have no IRQs. ignore them.
*/
diff --git a/drivers/parisc/pdc_stable.c b/drivers/parisc/pdc_stable.c
index fc4bde259dc..ebb09e98d21 100644
--- a/drivers/parisc/pdc_stable.c
+++ b/drivers/parisc/pdc_stable.c
@@ -282,6 +282,7 @@ pdcspath_hwpath_write(struct pdcspath_entry *entry, const char *buf, size_t coun
unsigned short i;
char in[count+1], *temp;
struct device *dev;
+ int ret;
if (!entry || !buf || !count)
return -EINVAL;
@@ -333,7 +334,9 @@ pdcspath_hwpath_write(struct pdcspath_entry *entry, const char *buf, size_t coun
/* Update the symlink to the real device */
sysfs_remove_link(&entry->kobj, "device");
- sysfs_create_link(&entry->kobj, &entry->dev->kobj, "device");
+ ret = sysfs_create_link(&entry->kobj, &entry->dev->kobj, "device");
+ WARN_ON(ret);
+
write_unlock(&entry->rw_lock);
printk(KERN_INFO PDCS_PREFIX ": changed \"%s\" path to \"%s\"\n",
@@ -1003,8 +1006,10 @@ pdcs_register_pathentries(void)
entry->ready = 2;
/* Add a nice symlink to the real device */
- if (entry->dev)
- sysfs_create_link(&entry->kobj, &entry->dev->kobj, "device");
+ if (entry->dev) {
+ err = sysfs_create_link(&entry->kobj, &entry->dev->kobj, "device");
+ WARN_ON(err);
+ }
write_unlock(&entry->rw_lock);
}
diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c
index d044c48323e..e527a0e1d6c 100644
--- a/drivers/parisc/sba_iommu.c
+++ b/drivers/parisc/sba_iommu.c
@@ -28,6 +28,7 @@
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/pci.h>
+#include <linux/scatterlist.h>
#include <asm/byteorder.h>
#include <asm/io.h>
@@ -1909,8 +1910,8 @@ sba_driver_callback(struct parisc_device *dev)
global_ioc_cnt *= 2;
}
- printk(KERN_INFO "%s found %s at 0x%lx\n",
- MODULE_NAME, version, dev->hpa.start);
+ printk(KERN_INFO "%s found %s at 0x%llx\n",
+ MODULE_NAME, version, (unsigned long long)dev->hpa.start);
sba_dev = kzalloc(sizeof(struct sba_device), GFP_KERNEL);
if (!sba_dev) {
diff --git a/drivers/parisc/superio.c b/drivers/parisc/superio.c
index 38cdf9fa36a..1e8d2d17f04 100644
--- a/drivers/parisc/superio.c
+++ b/drivers/parisc/superio.c
@@ -155,6 +155,7 @@ superio_init(struct pci_dev *pcidev)
struct superio_device *sio = &sio_dev;
struct pci_dev *pdev = sio->lio_pdev;
u16 word;
+ int ret;
if (sio->suckyio_irq_enabled)
return;
@@ -200,7 +201,8 @@ superio_init(struct pci_dev *pcidev)
pci_write_config_word (pdev, PCI_COMMAND, word);
pci_set_master (pdev);
- pci_enable_device(pdev);
+ ret = pci_enable_device(pdev);
+ BUG_ON(ret < 0); /* not too much we can do about this... */
/*
* Next project is programming the onboard interrupt controllers.
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 006054a4099..55505565073 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -20,6 +20,9 @@ obj-$(CONFIG_PCI_MSI) += msi.o
# Build the Hypertransport interrupt support
obj-$(CONFIG_HT_IRQ) += htirq.o
+# Build Intel IOMMU support
+obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
+
#
# Some architectures use the generic PCI setup functions
#
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
new file mode 100644
index 00000000000..5dfdfdac92e
--- /dev/null
+++ b/drivers/pci/dmar.c
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Ashok Raj <ashok.raj@intel.com>
+ * Copyright (C) Shaohua Li <shaohua.li@intel.com>
+ * Copyright (C) Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ *
+ * This file implements early detection/parsing of DMA Remapping Devices
+ * reported to OS through BIOS via DMA remapping reporting (DMAR) ACPI
+ * tables.
+ */
+
+#include <linux/pci.h>
+#include <linux/dmar.h>
+
+#undef PREFIX
+#define PREFIX "DMAR:"
+
+/* No locks are needed as the DMA remapping hardware unit
+ * list is constructed at boot time and hotplug of
+ * these units is not supported by the architecture.
+ */
+LIST_HEAD(dmar_drhd_units);
+LIST_HEAD(dmar_rmrr_units);
+
+static struct acpi_table_header * __initdata dmar_tbl;
+
+static void __init dmar_register_drhd_unit(struct dmar_drhd_unit *drhd)
+{
+ /*
+ * add INCLUDE_ALL at the tail, so a scan of the list will find it
+ * at the very end.
+ */
+ if (drhd->include_all)
+ list_add_tail(&drhd->list, &dmar_drhd_units);
+ else
+ list_add(&drhd->list, &dmar_drhd_units);
+}
+
+static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
+{
+ list_add(&rmrr->list, &dmar_rmrr_units);
+}
+
+static int __init dmar_parse_one_dev_scope(struct acpi_dmar_device_scope *scope,
+ struct pci_dev **dev, u16 segment)
+{
+ struct pci_bus *bus;
+ struct pci_dev *pdev = NULL;
+ struct acpi_dmar_pci_path *path;
+ int count;
+
+ bus = pci_find_bus(segment, scope->bus);
+ path = (struct acpi_dmar_pci_path *)(scope + 1);
+ count = (scope->length - sizeof(struct acpi_dmar_device_scope))
+ / sizeof(struct acpi_dmar_pci_path);
+
+ while (count) {
+ if (pdev)
+ pci_dev_put(pdev);
+ /*
+ * Some BIOSes list nonexistent devices in the DMAR table;
+ * just ignore them
+ */
+ if (!bus) {
+ printk(KERN_WARNING
+ PREFIX "Device scope bus [%d] not found\n",
+ scope->bus);
+ break;
+ }
+ pdev = pci_get_slot(bus, PCI_DEVFN(path->dev, path->fn));
+ if (!pdev) {
+ printk(KERN_WARNING PREFIX
+ "Device scope device [%04x:%02x:%02x.%02x] not found\n",
+ segment, bus->number, path->dev, path->fn);
+ break;
+ }
+ path++;
+ count--;
+ bus = pdev->subordinate;
+ }
+ if (!pdev) {
+ printk(KERN_WARNING PREFIX
+ "Device scope device [%04x:%02x:%02x.%02x] not found\n",
+ segment, scope->bus, path->dev, path->fn);
+ *dev = NULL;
+ return 0;
+ }
+ if ((scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT &&
+ pdev->subordinate) || (scope->entry_type ==
+ ACPI_DMAR_SCOPE_TYPE_BRIDGE && !pdev->subordinate)) {
+ pci_dev_put(pdev);
+ printk(KERN_WARNING PREFIX
+ "Device scope type does not match for %s\n",
+ pci_name(pdev));
+ return -EINVAL;
+ }
+ *dev = pdev;
+ return 0;
+}
+
+static int __init dmar_parse_dev_scope(void *start, void *end, int *cnt,
+ struct pci_dev ***devices, u16 segment)
+{
+ struct acpi_dmar_device_scope *scope;
+ void * tmp = start;
+ int index;
+ int ret;
+
+ *cnt = 0;
+ while (start < end) {
+ scope = start;
+ if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT ||
+ scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE)
+ (*cnt)++;
+ else
+ printk(KERN_WARNING PREFIX
+ "Unsupported device scope\n");
+ start += scope->length;
+ }
+ if (*cnt == 0)
+ return 0;
+
+ *devices = kcalloc(*cnt, sizeof(struct pci_dev *), GFP_KERNEL);
+ if (!*devices)
+ return -ENOMEM;
+
+ start = tmp;
+ index = 0;
+ while (start < end) {
+ scope = start;
+ if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT ||
+ scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE) {
+ ret = dmar_parse_one_dev_scope(scope,
+ &(*devices)[index], segment);
+ if (ret) {
+ kfree(*devices);
+ return ret;
+ }
+ index++;
+ }
+ start += scope->length;
+ }
+
+ return 0;
+}
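
dmar_parse_dev_scope() makes two passes over a variable-length ACPI
table: the first pass only counts the entry types it understands,
sizing a single kcalloc() exactly, and the second pass fills the
array. A generic userspace sketch of the idiom over length-prefixed
records (the record layout here is invented):

    #include <stdio.h>
    #include <stdlib.h>

    struct rec {
        unsigned char type;
        unsigned char length;    /* total bytes, payload included */
    };

    #define WANTED_TYPE 1

    int main(void)
    {
        /* three fake records: type 1, type 2 (1 payload byte), type 1 */
        unsigned char table[] = { 1, 2, 2, 3, 0, 1, 2 };
        unsigned char *end = table + sizeof(table);
        unsigned char *p;
        struct rec **recs;
        int cnt = 0, index = 0;

        /* Pass 1: count the records we care about. */
        for (p = table; p < end; p += ((struct rec *)p)->length)
            if (((struct rec *)p)->type == WANTED_TYPE)
                cnt++;

        recs = calloc(cnt, sizeof(*recs));
        if (!recs)
            return 1;

        /* Pass 2: store pointers into the exactly-sized array. */
        for (p = table; p < end; p += ((struct rec *)p)->length)
            if (((struct rec *)p)->type == WANTED_TYPE)
                recs[index++] = (struct rec *)p;

        printf("matched %d records\n", index);
        free(recs);
        return 0;
    }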
+
+/**
+ * dmar_parse_one_drhd - parses exactly one DMA remapping hardware definition
+ * structure, which uniquely represents one DMA remapping hardware unit
+ * present in the platform
+ */
+static int __init
+dmar_parse_one_drhd(struct acpi_dmar_header *header)
+{
+ struct acpi_dmar_hardware_unit *drhd;
+ struct dmar_drhd_unit *dmaru;
+ int ret = 0;
+ static int include_all;
+
+ dmaru = kzalloc(sizeof(*dmaru), GFP_KERNEL);
+ if (!dmaru)
+ return -ENOMEM;
+
+ drhd = (struct acpi_dmar_hardware_unit *)header;
+ dmaru->reg_base_addr = drhd->address;
+ dmaru->include_all = drhd->flags & 0x1; /* BIT0: INCLUDE_ALL */
+
+ if (!dmaru->include_all)
+ ret = dmar_parse_dev_scope((void *)(drhd + 1),
+ ((void *)drhd) + header->length,
+ &dmaru->devices_cnt, &dmaru->devices,
+ drhd->segment);
+ else {
+ /* Only allow one INCLUDE_ALL */
+ if (include_all) {
+ printk(KERN_WARNING PREFIX "Only one INCLUDE_ALL "
+ "device scope is allowed\n");
+ ret = -EINVAL;
+ }
+ include_all = 1;
+ }
+
+ if (ret || (dmaru->devices_cnt == 0 && !dmaru->include_all))
+ kfree(dmaru);
+ else
+ dmar_register_drhd_unit(dmaru);
+ return ret;
+}
+
+static int __init
+dmar_parse_one_rmrr(struct acpi_dmar_header *header)
+{
+ struct acpi_dmar_reserved_memory *rmrr;
+ struct dmar_rmrr_unit *rmrru;
+ int ret = 0;
+
+ rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
+ if (!rmrru)
+ return -ENOMEM;
+
+ rmrr = (struct acpi_dmar_reserved_memory *)header;
+ rmrru->base_address = rmrr->base_address;
+ rmrru->end_address = rmrr->end_address;
+ ret = dmar_parse_dev_scope((void *)(rmrr + 1),
+ ((void *)rmrr) + header->length,
+ &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
+
+ if (ret || (rmrru->devices_cnt == 0))
+ kfree(rmrru);
+ else
+ dmar_register_rmrr_unit(rmrru);
+ return ret;
+}
+
+static void __init
+dmar_table_print_dmar_entry(struct acpi_dmar_header *header)
+{
+ struct acpi_dmar_hardware_unit *drhd;
+ struct acpi_dmar_reserved_memory *rmrr;
+
+ switch (header->type) {
+ case ACPI_DMAR_TYPE_HARDWARE_UNIT:
+ drhd = (struct acpi_dmar_hardware_unit *)header;
+ printk (KERN_INFO PREFIX
+ "DRHD (flags: 0x%08x)base: 0x%016Lx\n",
+ drhd->flags, drhd->address);
+ break;
+ case ACPI_DMAR_TYPE_RESERVED_MEMORY:
+ rmrr = (struct acpi_dmar_reserved_memory *)header;
+
+ printk (KERN_INFO PREFIX
+ "RMRR base: 0x%016Lx end: 0x%016Lx\n",
+ rmrr->base_address, rmrr->end_address);
+ break;
+ }
+}
+
+/**
+ * parse_dmar_table - parses the DMA reporting table
+ */
+static int __init
+parse_dmar_table(void)
+{
+ struct acpi_table_dmar *dmar;
+ struct acpi_dmar_header *entry_header;
+ int ret = 0;
+
+ dmar = (struct acpi_table_dmar *)dmar_tbl;
+ if (!dmar)
+ return -ENODEV;
+
+ if (!dmar->width) {
+ printk (KERN_WARNING PREFIX "Zero: Invalid DMAR haw\n");
+ return -EINVAL;
+ }
+
+ printk (KERN_INFO PREFIX "Host address width %d\n",
+ dmar->width + 1);
+
+ entry_header = (struct acpi_dmar_header *)(dmar + 1);
+ while (((unsigned long)entry_header) <
+ (((unsigned long)dmar) + dmar_tbl->length)) {
+ dmar_table_print_dmar_entry(entry_header);
+
+ switch (entry_header->type) {
+ case ACPI_DMAR_TYPE_HARDWARE_UNIT:
+ ret = dmar_parse_one_drhd(entry_header);
+ break;
+ case ACPI_DMAR_TYPE_RESERVED_MEMORY:
+ ret = dmar_parse_one_rmrr(entry_header);
+ break;
+ default:
+ printk(KERN_WARNING PREFIX
+ "Unknown DMAR structure type\n");
+ ret = 0; /* for forward compatibility */
+ break;
+ }
+ if (ret)
+ break;
+
+ entry_header = ((void *)entry_header + entry_header->length);
+ }
+ return ret;
+}
+
+
+int __init dmar_table_init(void)
+{
+ parse_dmar_table();
+ if (list_empty(&dmar_drhd_units)) {
+ printk(KERN_INFO PREFIX "No DMAR devices found\n");
+ return -ENODEV;
+ }
+ return 0;
+}
+
+/**
+ * early_dmar_detect - checks to see if the platform supports DMAR devices
+ */
+int __init early_dmar_detect(void)
+{
+ acpi_status status = AE_OK;
+
+ /* if we can find the DMAR table, then there are DMAR devices */
+ status = acpi_get_table(ACPI_SIG_DMAR, 0,
+ (struct acpi_table_header **)&dmar_tbl);
+
+ if (ACPI_SUCCESS(status) && !dmar_tbl) {
+ printk (KERN_WARNING PREFIX "Unable to map DMAR\n");
+ status = AE_NOT_FOUND;
+ }
+
+ return (ACPI_SUCCESS(status) ? 1 : 0);
+}
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
new file mode 100644
index 00000000000..0c4ab3b0727
--- /dev/null
+++ b/drivers/pci/intel-iommu.c
@@ -0,0 +1,2271 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Ashok Raj <ashok.raj@intel.com>
+ * Copyright (C) Shaohua Li <shaohua.li@intel.com>
+ * Copyright (C) Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ */
+
+#include <linux/init.h>
+#include <linux/bitmap.h>
+#include <linux/slab.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/sysdev.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/dmar.h>
+#include <linux/dma-mapping.h>
+#include <linux/mempool.h>
+#include "iova.h"
+#include "intel-iommu.h"
+#include <asm/proto.h> /* force_iommu in this header on x86-64 */
+#include <asm/cacheflush.h>
+#include <asm/iommu.h>
+#include "pci.h"
+
+#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
+#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
+
+#define IOAPIC_RANGE_START (0xfee00000)
+#define IOAPIC_RANGE_END (0xfeefffff)
+#define IOVA_START_ADDR (0x1000)
+
+#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
+
+#define DMAR_OPERATION_TIMEOUT (HZ*60) /* 1m */
+
+#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
+
+static void domain_remove_dev_info(struct dmar_domain *domain);
+
+static int dmar_disabled;
+static int __initdata dmar_map_gfx = 1;
+static int dmar_forcedac;
+
+#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
+static DEFINE_SPINLOCK(device_domain_lock);
+static LIST_HEAD(device_domain_list);
+
+static int __init intel_iommu_setup(char *str)
+{
+ if (!str)
+ return -EINVAL;
+ while (*str) {
+ if (!strncmp(str, "off", 3)) {
+ dmar_disabled = 1;
+ printk(KERN_INFO"Intel-IOMMU: disabled\n");
+ } else if (!strncmp(str, "igfx_off", 8)) {
+ dmar_map_gfx = 0;
+ printk(KERN_INFO
+ "Intel-IOMMU: disable GFX device mapping\n");
+ } else if (!strncmp(str, "forcedac", 8)) {
+ printk (KERN_INFO
+ "Intel-IOMMU: Forcing DAC for PCI devices\n");
+ dmar_forcedac = 1;
+ }
+
+ str += strcspn(str, ",");
+ while (*str == ',')
+ str++;
+ }
+ return 0;
+}
+__setup("intel_iommu=", intel_iommu_setup);
+
+static struct kmem_cache *iommu_domain_cache;
+static struct kmem_cache *iommu_devinfo_cache;
+static struct kmem_cache *iommu_iova_cache;
+
+static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
+{
+ unsigned int flags;
+ void *vaddr;
+
+ /* trying to avoid low memory issues */
+ flags = current->flags & PF_MEMALLOC;
+ current->flags |= PF_MEMALLOC;
+ vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
+ current->flags &= (~PF_MEMALLOC | flags);
+ return vaddr;
+}
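
iommu_kmem_cache_alloc() sets PF_MEMALLOC for the duration of the
allocation and restores the caller's original bit with a single mask:
since flags holds only the saved bit, cur &= ~PF_MEMALLOC | flags
clears the bit exactly when it was clear on entry. A worked check of
that bit trick (the flag value here is illustrative):

    #include <stdio.h>

    #define PF_MEMALLOC 0x800    /* illustrative bit value */

    static unsigned set_and_restore(unsigned cur)
    {
        unsigned saved = cur & PF_MEMALLOC;    /* caller's bit only */

        cur |= PF_MEMALLOC;    /* ... the allocation runs here ... */

        /* saved set: mask is all-ones, no change;
         * saved clear: mask is ~PF_MEMALLOC, bit cleared. */
        cur &= ~PF_MEMALLOC | saved;
        return cur;
    }

    int main(void)
    {
        printf("was clear -> %#x\n", set_and_restore(0x4));
        printf("was set   -> %#x\n", set_and_restore(0x4 | PF_MEMALLOC));
        return 0;
    }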
+
+
+static inline void *alloc_pgtable_page(void)
+{
+ unsigned int flags;
+ void *vaddr;
+
+ /* trying to avoid low memory issues */
+ flags = current->flags & PF_MEMALLOC;
+ current->flags |= PF_MEMALLOC;
+ vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
+ current->flags &= (~PF_MEMALLOC | flags);
+ return vaddr;
+}
+
+static inline void free_pgtable_page(void *vaddr)
+{
+ free_page((unsigned long)vaddr);
+}
+
+static inline void *alloc_domain_mem(void)
+{
+ return iommu_kmem_cache_alloc(iommu_domain_cache);
+}
+
+static inline void free_domain_mem(void *vaddr)
+{
+ kmem_cache_free(iommu_domain_cache, vaddr);
+}
+
+static inline void * alloc_devinfo_mem(void)
+{
+ return iommu_kmem_cache_alloc(iommu_devinfo_cache);
+}
+
+static inline void free_devinfo_mem(void *vaddr)
+{
+ kmem_cache_free(iommu_devinfo_cache, vaddr);
+}
+
+struct iova *alloc_iova_mem(void)
+{
+ return iommu_kmem_cache_alloc(iommu_iova_cache);
+}
+
+void free_iova_mem(struct iova *iova)
+{
+ kmem_cache_free(iommu_iova_cache, iova);
+}
+
+static inline void __iommu_flush_cache(
+ struct intel_iommu *iommu, void *addr, int size)
+{
+ if (!ecap_coherent(iommu->ecap))
+ clflush_cache_range(addr, size);
+}
+
+/* Gets context entry for a given bus and devfn */
+static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
+ u8 bus, u8 devfn)
+{
+ struct root_entry *root;
+ struct context_entry *context;
+ unsigned long phy_addr;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ root = &iommu->root_entry[bus];
+ context = get_context_addr_from_root(root);
+ if (!context) {
+ context = (struct context_entry *)alloc_pgtable_page();
+ if (!context) {
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ return NULL;
+ }
+ __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
+ phy_addr = virt_to_phys((void *)context);
+ set_root_value(root, phy_addr);
+ set_root_present(root);
+ __iommu_flush_cache(iommu, root, sizeof(*root));
+ }
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ return &context[devfn];
+}
+
+static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+ struct root_entry *root;
+ struct context_entry *context;
+ int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ root = &iommu->root_entry[bus];
+ context = get_context_addr_from_root(root);
+ if (!context) {
+ ret = 0;
+ goto out;
+ }
+ ret = context_present(context[devfn]);
+out:
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ return ret;
+}
+
+static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+ struct root_entry *root;
+ struct context_entry *context;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ root = &iommu->root_entry[bus];
+ context = get_context_addr_from_root(root);
+ if (context) {
+ context_clear_entry(context[devfn]);
+ __iommu_flush_cache(iommu, &context[devfn],
+ sizeof(*context));
+ }
+ spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+static void free_context_table(struct intel_iommu *iommu)
+{
+ struct root_entry *root;
+ int i;
+ unsigned long flags;
+ struct context_entry *context;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ if (!iommu->root_entry) {
+ goto out;
+ }
+ for (i = 0; i < ROOT_ENTRY_NR; i++) {
+ root = &iommu->root_entry[i];
+ context = get_context_addr_from_root(root);
+ if (context)
+ free_pgtable_page(context);
+ }
+ free_pgtable_page(iommu->root_entry);
+ iommu->root_entry = NULL;
+out:
+ spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+/* page table handling */
+#define LEVEL_STRIDE (9)
+#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
+
+static inline int agaw_to_level(int agaw)
+{
+ return agaw + 2;
+}
+
+static inline int agaw_to_width(int agaw)
+{
+ return 30 + agaw * LEVEL_STRIDE;
+}
+
+static inline int width_to_agaw(int width)
+{
+ return (width - 30) / LEVEL_STRIDE;
+}
+
+static inline unsigned int level_to_offset_bits(int level)
+{
+ return (12 + (level - 1) * LEVEL_STRIDE);
+}
+
+static inline int address_level_offset(u64 addr, int level)
+{
+ return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
+}
+
+static inline u64 level_mask(int level)
+{
+ return ((u64)-1 << level_to_offset_bits(level));
+}
+
+static inline u64 level_size(int level)
+{
+ return ((u64)1 << level_to_offset_bits(level));
+}
+
+static inline u64 align_to_level(u64 addr, int level)
+{
+ return ((addr + level_size(level) - 1) & level_mask(level));
+}
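
These helpers describe a radix page table with 9 bits per level over
4K pages: level 1 indexes address bits 12..20, level 2 bits 21..29,
and so on, while an AGAW of n gives a 30 + 9n bit address width
handled by n + 2 levels (so the default 48-bit width uses AGAW 2 and
4 levels). A quick userspace check of the arithmetic:

    #include <stdio.h>

    #define LEVEL_STRIDE 9
    #define LEVEL_MASK (((unsigned long long)1 << LEVEL_STRIDE) - 1)

    static int level_to_offset_bits(int level)
    {
        return 12 + (level - 1) * LEVEL_STRIDE;
    }

    static unsigned long long level_size(int level)
    {
        return 1ULL << level_to_offset_bits(level);
    }

    static int address_level_offset(unsigned long long addr, int level)
    {
        return (addr >> level_to_offset_bits(level)) & LEVEL_MASK;
    }

    int main(void)
    {
        unsigned long long addr = 0x123456789000ULL;
        int level;

        printf("agaw 2 -> width %d bits, %d levels\n",
               30 + 2 * LEVEL_STRIDE, 2 + 2);
        for (level = 1; level <= 4; level++)
            printf("level %d: entry covers %#llx bytes, index %d\n",
                   level, level_size(level),
                   address_level_offset(addr, level));
        return 0;
    }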
+
+static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
+{
+ int addr_width = agaw_to_width(domain->agaw);
+ struct dma_pte *parent, *pte = NULL;
+ int level = agaw_to_level(domain->agaw);
+ int offset;
+ unsigned long flags;
+
+ BUG_ON(!domain->pgd);
+
+ addr &= (((u64)1) << addr_width) - 1;
+ parent = domain->pgd;
+
+ spin_lock_irqsave(&domain->mapping_lock, flags);
+ while (level > 0) {
+ void *tmp_page;
+
+ offset = address_level_offset(addr, level);
+ pte = &parent[offset];
+ if (level == 1)
+ break;
+
+ if (!dma_pte_present(*pte)) {
+ tmp_page = alloc_pgtable_page();
+
+ if (!tmp_page) {
+ spin_unlock_irqrestore(&domain->mapping_lock,
+ flags);
+ return NULL;
+ }
+ __iommu_flush_cache(domain->iommu, tmp_page,
+ PAGE_SIZE_4K);
+ dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
+ /*
+ * high level table always sets r/w, last level page
+ * table control read/write
+ */
+ dma_set_pte_readable(*pte);
+ dma_set_pte_writable(*pte);
+ __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
+ }
+ parent = phys_to_virt(dma_pte_addr(*pte));
+ level--;
+ }
+
+ spin_unlock_irqrestore(&domain->mapping_lock, flags);
+ return pte;
+}
+
+/* return the address's pte at a specific level */
+static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
+ int level)
+{
+ struct dma_pte *parent, *pte = NULL;
+ int total = agaw_to_level(domain->agaw);
+ int offset;
+
+ parent = domain->pgd;
+ while (level <= total) {
+ offset = address_level_offset(addr, total);
+ pte = &parent[offset];
+ if (level == total)
+ return pte;
+
+ if (!dma_pte_present(*pte))
+ break;
+ parent = phys_to_virt(dma_pte_addr(*pte));
+ total--;
+ }
+ return NULL;
+}
+
+/* clear one page's page table */
+static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
+{
+ struct dma_pte *pte = NULL;
+
+ /* get last level pte */
+ pte = dma_addr_level_pte(domain, addr, 1);
+
+ if (pte) {
+ dma_clear_pte(*pte);
+ __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
+ }
+}
+
+/* clear the last level pte; a tlb flush should follow */
+static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
+{
+ int addr_width = agaw_to_width(domain->agaw);
+
+ start &= (((u64)1) << addr_width) - 1;
+ end &= (((u64)1) << addr_width) - 1;
+ /* in case it's partial page */
+ start = PAGE_ALIGN_4K(start);
+ end &= PAGE_MASK_4K;
+
+ /* we don't need lock here, nobody else touches the iova range */
+ while (start < end) {
+ dma_pte_clear_one(domain, start);
+ start += PAGE_SIZE_4K;
+ }
+}
+
+/* free page table pages. last level pte should already be cleared */
+static void dma_pte_free_pagetable(struct dmar_domain *domain,
+ u64 start, u64 end)
+{
+ int addr_width = agaw_to_width(domain->agaw);
+ struct dma_pte *pte;
+ int total = agaw_to_level(domain->agaw);
+ int level;
+ u64 tmp;
+
+ start &= (((u64)1) << addr_width) - 1;
+ end &= (((u64)1) << addr_width) - 1;
+
+ /* we don't need lock here, nobody else touches the iova range */
+ level = 2;
+ while (level <= total) {
+ tmp = align_to_level(start, level);
+ if (tmp >= end || (tmp + level_size(level) > end))
+ return;
+
+ while (tmp < end) {
+ pte = dma_addr_level_pte(domain, tmp, level);
+ if (pte) {
+ free_pgtable_page(
+ phys_to_virt(dma_pte_addr(*pte)));
+ dma_clear_pte(*pte);
+ __iommu_flush_cache(domain->iommu,
+ pte, sizeof(*pte));
+ }
+ tmp += level_size(level);
+ }
+ level++;
+ }
+ /* free pgd */
+ if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
+ free_pgtable_page(domain->pgd);
+ domain->pgd = NULL;
+ }
+}
+
+/* iommu handling */
+static int iommu_alloc_root_entry(struct intel_iommu *iommu)
+{
+ struct root_entry *root;
+ unsigned long flags;
+
+ root = (struct root_entry *)alloc_pgtable_page();
+ if (!root)
+ return -ENOMEM;
+
+ __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ iommu->root_entry = root;
+ spin_unlock_irqrestore(&iommu->lock, flags);
+
+ return 0;
+}
+
+#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
+{\
+ unsigned long start_time = jiffies;\
+ while (1) {\
+ sts = op (iommu->reg + offset);\
+ if (cond)\
+ break;\
+ if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))\
+ panic("DMAR hardware is malfunctioning\n");\
+ cpu_relax();\
+ }\
+}
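
IOMMU_WAIT_OP() is a busy-wait with a deadline: re-read a status
register until the condition holds, and give up loudly once a
wall-clock budget expires. A userspace sketch of the same shape using
time(2) in place of jiffies (names and timeout are placeholders):

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    #define OP_TIMEOUT 2    /* seconds; the patch allows 60 * HZ jiffies */

    /* Hypothetical status source: becomes ready on the third read. */
    static unsigned read_status(void)
    {
        static int reads;
        return ++reads >= 3 ? 0x1 : 0x0;
    }

    #define WAIT_OP(op, cond, sts)                                  \
    do {                                                            \
        time_t start = time(NULL);                                  \
        for (;;) {                                                  \
            sts = op();                                             \
            if (cond)                                               \
                break;                                              \
            if (time(NULL) > start + OP_TIMEOUT) {                  \
                fprintf(stderr, "hardware is malfunctioning\n");    \
                exit(1);                                            \
            }                                                       \
        }                                                           \
    } while (0)

    int main(void)
    {
        unsigned sts;

        WAIT_OP(read_status, (sts & 0x1), sts);
        printf("ready, sts=%#x\n", sts);
        return 0;
    }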
+
+static void iommu_set_root_entry(struct intel_iommu *iommu)
+{
+ void *addr;
+ u32 cmd, sts;
+ unsigned long flag;
+
+ addr = iommu->root_entry;
+
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
+
+ cmd = iommu->gcmd | DMA_GCMD_SRTP;
+ writel(cmd, iommu->reg + DMAR_GCMD_REG);
+
+ /* Make sure the hardware completes it */
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, (sts & DMA_GSTS_RTPS), sts);
+
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+static void iommu_flush_write_buffer(struct intel_iommu *iommu)
+{
+ u32 val;
+ unsigned long flag;
+
+ if (!cap_rwbf(iommu->cap))
+ return;
+ val = iommu->gcmd | DMA_GCMD_WBF;
+
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ writel(val, iommu->reg + DMAR_GCMD_REG);
+
+ /* Make sure the hardware completes it */
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, (!(val & DMA_GSTS_WBFS)), val);
+
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+/* return value determines whether we need a write buffer flush */
+static int __iommu_flush_context(struct intel_iommu *iommu,
+ u16 did, u16 source_id, u8 function_mask, u64 type,
+ int non_present_entry_flush)
+{
+ u64 val = 0;
+ unsigned long flag;
+
+ /*
+ * In the non-present entry flush case, if the hardware doesn't cache
+ * non-present entries we do nothing; if it does cache them, we flush
+ * the entries of domain 0 (the domain id used to cache any
+ * non-present entries)
+ */
+ if (non_present_entry_flush) {
+ if (!cap_caching_mode(iommu->cap))
+ return 1;
+ else
+ did = 0;
+ }
+
+ switch (type) {
+ case DMA_CCMD_GLOBAL_INVL:
+ val = DMA_CCMD_GLOBAL_INVL;
+ break;
+ case DMA_CCMD_DOMAIN_INVL:
+ val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
+ break;
+ case DMA_CCMD_DEVICE_INVL:
+ val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
+ | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
+ break;
+ default:
+ BUG();
+ }
+ val |= DMA_CCMD_ICC;
+
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
+
+ /* Make sure the hardware completes it */
+ IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
+ dmar_readq, (!(val & DMA_CCMD_ICC)), val);
+
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+ /* flushing the context entry will implicitly flush the write buffer */
+ return 0;
+}
+
+static inline int iommu_flush_context_global(struct intel_iommu *iommu,
+ int non_present_entry_flush)
+{
+ return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
+ non_present_entry_flush);
+}
+
+static inline int iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
+ int non_present_entry_flush)
+{
+ return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
+ non_present_entry_flush);
+}
+
+static inline int iommu_flush_context_device(struct intel_iommu *iommu,
+ u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
+{
+ return __iommu_flush_context(iommu, did, source_id, function_mask,
+ DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
+}
+
+/* return value determines whether we need a write buffer flush */
+static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
+ u64 addr, unsigned int size_order, u64 type,
+ int non_present_entry_flush)
+{
+ int tlb_offset = ecap_iotlb_offset(iommu->ecap);
+ u64 val = 0, val_iva = 0;
+ unsigned long flag;
+
+ /*
+ * In the non-present entry flush case, if the hardware doesn't cache
+ * non-present entries we do nothing; if it does cache them, we flush
+ * the entries of domain 0 (the domain id used to cache any
+ * non-present entries)
+ */
+ if (non_present_entry_flush) {
+ if (!cap_caching_mode(iommu->cap))
+ return 1;
+ else
+ did = 0;
+ }
+
+ switch (type) {
+ case DMA_TLB_GLOBAL_FLUSH:
+ /* a global flush doesn't need to set IVA_REG */
+ val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
+ break;
+ case DMA_TLB_DSI_FLUSH:
+ val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+ break;
+ case DMA_TLB_PSI_FLUSH:
+ val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
+ /* Note: always flush non-leaf currently */
+ val_iva = size_order | addr;
+ break;
+ default:
+ BUG();
+ }
+ /* Note: set drain read/write */
+#if 0
+ /*
+ * This is probably meant to be extra safe; it looks like we can
+ * ignore it without any impact.
+ */
+ if (cap_read_drain(iommu->cap))
+ val |= DMA_TLB_READ_DRAIN;
+#endif
+ if (cap_write_drain(iommu->cap))
+ val |= DMA_TLB_WRITE_DRAIN;
+
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ /* Note: Only uses first TLB reg currently */
+ if (val_iva)
+ dmar_writeq(iommu->reg + tlb_offset, val_iva);
+ dmar_writeq(iommu->reg + tlb_offset + 8, val);
+
+ /* Make sure the hardware completes it */
+ IOMMU_WAIT_OP(iommu, tlb_offset + 8,
+ dmar_readq, (!(val & DMA_TLB_IVT)), val);
+
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+ /* check IOTLB invalidation granularity */
+ if (DMA_TLB_IAIG(val) == 0)
+ printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
+ if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
+ pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
+ DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
+ /* flushing the context entry will implicitly flush the write buffer */
+ return 0;
+}
+
+static inline int iommu_flush_iotlb_global(struct intel_iommu *iommu,
+ int non_present_entry_flush)
+{
+ return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
+ non_present_entry_flush);
+}
+
+static inline int iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
+ int non_present_entry_flush)
+{
+ return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
+ non_present_entry_flush);
+}
+
+static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
+ u64 addr, unsigned int pages, int non_present_entry_flush)
+{
+ unsigned int mask;
+
+ BUG_ON(addr & (~PAGE_MASK_4K));
+ BUG_ON(pages == 0);
+
+ /* Fallback to domain selective flush if no PSI support */
+ if (!cap_pgsel_inv(iommu->cap))
+ return iommu_flush_iotlb_dsi(iommu, did,
+ non_present_entry_flush);
+
+ /*
+ * PSI requires the region size to be a power of two (2^x pages),
+ * with the base address naturally aligned to that size
+ */
+ mask = ilog2(__roundup_pow_of_two(pages));
+ /* Fallback to domain selective flush if size is too big */
+ if (mask > cap_max_amask_val(iommu->cap))
+ return iommu_flush_iotlb_dsi(iommu, did,
+ non_present_entry_flush);
+
+ return __iommu_flush_iotlb(iommu, did, addr, mask,
+ DMA_TLB_PSI_FLUSH, non_present_entry_flush);
+}
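
The PSI mask arithmetic above is easy to sanity-check in isolation: __roundup_pow_of_two() rounds the page count up, and ilog2() turns it into the address-mask order the hardware expects. A minimal standalone C sketch, not part of the patch (roundup_pow2() stands in for the kernel helper):

#include <assert.h>
#include <stdio.h>

static unsigned long roundup_pow2(unsigned long n)
{
	unsigned long v = 1;

	while (v < n)
		v <<= 1;
	return v;
}

static unsigned int psi_mask(unsigned int pages)
{
	unsigned long v = roundup_pow2(pages);
	unsigned int mask = 0;

	while (v >>= 1)
		mask++;		/* ilog2() of the rounded-up count */
	return mask;
}

int main(void)
{
	/* 5 pages round up to 8, so the mask order is 3: the hardware
	 * invalidates one 32KB-aligned region of eight 4KB pages. */
	assert(psi_mask(5) == 3);
	assert(psi_mask(1) == 0);
	printf("mask for 5 pages: %u\n", psi_mask(5));
	return 0;
}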
+
+static int iommu_enable_translation(struct intel_iommu *iommu)
+{
+ u32 sts;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu->register_lock, flags);
+ writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
+
+ /* Make sure hardware completes it */
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, (sts & DMA_GSTS_TES), sts);
+
+ iommu->gcmd |= DMA_GCMD_TE;
+ spin_unlock_irqrestore(&iommu->register_lock, flags);
+ return 0;
+}
+
+static int iommu_disable_translation(struct intel_iommu *iommu)
+{
+ u32 sts;
+ unsigned long flag;
+
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ iommu->gcmd &= ~DMA_GCMD_TE;
+ writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+ /* Make sure hardware completes it */
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, (!(sts & DMA_GSTS_TES)), sts);
+
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+ return 0;
+}
+
+/* iommu interrupt handling. Most of it is MSI-like. */
+
+static char *fault_reason_strings[] =
+{
+ "Software",
+ "Present bit in root entry is clear",
+ "Present bit in context entry is clear",
+ "Invalid context entry",
+ "Access beyond MGAW",
+ "PTE Write access is not set",
+ "PTE Read access is not set",
+ "Next page table ptr is invalid",
+ "Root table address invalid",
+ "Context table ptr is invalid",
+ "non-zero reserved fields in RTP",
+ "non-zero reserved fields in CTP",
+ "non-zero reserved fields in PTE",
+ "Unknown"
+};
+#define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
+
+char *dmar_get_fault_reason(u8 fault_reason)
+{
+ if (fault_reason > MAX_FAULT_REASON_IDX)
+ return fault_reason_strings[MAX_FAULT_REASON_IDX];
+ else
+ return fault_reason_strings[fault_reason];
+}
+
+void dmar_msi_unmask(unsigned int irq)
+{
+ struct intel_iommu *iommu = get_irq_data(irq);
+ unsigned long flag;
+
+ /* unmask it */
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ writel(0, iommu->reg + DMAR_FECTL_REG);
+ /* Read a reg to force flush the post write */
+ readl(iommu->reg + DMAR_FECTL_REG);
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void dmar_msi_mask(unsigned int irq)
+{
+ unsigned long flag;
+ struct intel_iommu *iommu = get_irq_data(irq);
+
+ /* mask it */
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
+ /* Read a reg to force flush the post write */
+ readl(iommu->reg + DMAR_FECTL_REG);
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void dmar_msi_write(int irq, struct msi_msg *msg)
+{
+ struct intel_iommu *iommu = get_irq_data(irq);
+ unsigned long flag;
+
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
+ writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
+ writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void dmar_msi_read(int irq, struct msi_msg *msg)
+{
+ struct intel_iommu *iommu = get_irq_data(irq);
+ unsigned long flag;
+
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
+ msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
+ msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
+ u8 fault_reason, u16 source_id, u64 addr)
+{
+ char *reason;
+
+ reason = dmar_get_fault_reason(fault_reason);
+
+ printk(KERN_ERR
+ "DMAR:[%s] Request device [%02x:%02x.%d] "
+ "fault addr %llx \n"
+ "DMAR:[fault reason %02d] %s\n",
+ (type ? "DMA Read" : "DMA Write"),
+ (source_id >> 8), PCI_SLOT(source_id & 0xFF),
+ PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
+ return 0;
+}
+
+#define PRIMARY_FAULT_REG_LEN (16)
+static irqreturn_t iommu_page_fault(int irq, void *dev_id)
+{
+ struct intel_iommu *iommu = dev_id;
+ int reg, fault_index;
+ u32 fault_status;
+ unsigned long flag;
+
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ fault_status = readl(iommu->reg + DMAR_FSTS_REG);
+
+ /* TBD: ignore advanced fault log currently */
+ if (!(fault_status & DMA_FSTS_PPF))
+ goto clear_overflow;
+
+ fault_index = dma_fsts_fault_record_index(fault_status);
+ reg = cap_fault_reg_offset(iommu->cap);
+ while (1) {
+ u8 fault_reason;
+ u16 source_id;
+ u64 guest_addr;
+ int type;
+ u32 data;
+
+ /* highest 32 bits */
+ data = readl(iommu->reg + reg +
+ fault_index * PRIMARY_FAULT_REG_LEN + 12);
+ if (!(data & DMA_FRCD_F))
+ break;
+
+ fault_reason = dma_frcd_fault_reason(data);
+ type = dma_frcd_type(data);
+
+ data = readl(iommu->reg + reg +
+ fault_index * PRIMARY_FAULT_REG_LEN + 8);
+ source_id = dma_frcd_source_id(data);
+
+ guest_addr = dmar_readq(iommu->reg + reg +
+ fault_index * PRIMARY_FAULT_REG_LEN);
+ guest_addr = dma_frcd_page_addr(guest_addr);
+ /* clear the fault */
+ writel(DMA_FRCD_F, iommu->reg + reg +
+ fault_index * PRIMARY_FAULT_REG_LEN + 12);
+
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+ iommu_page_fault_do_one(iommu, type, fault_reason,
+ source_id, guest_addr);
+
+ fault_index++;
+ if (fault_index >= cap_num_fault_regs(iommu->cap))
+ fault_index = 0;
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ }
+clear_overflow:
+ /* clear primary fault overflow */
+ fault_status = readl(iommu->reg + DMAR_FSTS_REG);
+ if (fault_status & DMA_FSTS_PFO)
+ writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
+
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+ return IRQ_HANDLED;
+}
+
+int dmar_set_interrupt(struct intel_iommu *iommu)
+{
+ int irq, ret;
+
+ irq = create_irq();
+ if (!irq) {
+ printk(KERN_ERR "IOMMU: no free vectors\n");
+ return -EINVAL;
+ }
+
+ set_irq_data(irq, iommu);
+ iommu->irq = irq;
+
+ ret = arch_setup_dmar_msi(irq);
+ if (ret) {
+ set_irq_data(irq, NULL);
+ iommu->irq = 0;
+ destroy_irq(irq);
+ return ret;
+ }
+
+ /* Force the fault register to be cleared */
+ iommu_page_fault(irq, iommu);
+
+ ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
+ if (ret)
+ printk(KERN_ERR "IOMMU: can't request irq\n");
+ return ret;
+}
+
+static int iommu_init_domains(struct intel_iommu *iommu)
+{
+ unsigned long ndomains;
+ unsigned long nlongs;
+
+ ndomains = cap_ndoms(iommu->cap);
+ pr_debug("Number of Domains supportd <%ld>\n", ndomains);
+ nlongs = BITS_TO_LONGS(ndomains);
+
+ /* TBD: there might be 64K domains,
+ * consider other allocation for future chips
+ */
+ iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
+ if (!iommu->domain_ids) {
+ printk(KERN_ERR "Allocating domain id array failed\n");
+ return -ENOMEM;
+ }
+ iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
+ GFP_KERNEL);
+ if (!iommu->domains) {
+ printk(KERN_ERR "Allocating domain array failed\n");
+ kfree(iommu->domain_ids);
+ return -ENOMEM;
+ }
+
+ /*
+ * if Caching mode is set, then invalid translations are tagged
+ * with domainid 0. Hence we need to pre-allocate it.
+ */
+ if (cap_caching_mode(iommu->cap))
+ set_bit(0, iommu->domain_ids);
+ return 0;
+}
+
+static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
+{
+ struct intel_iommu *iommu;
+ int ret;
+ int map_size;
+ u32 ver;
+
+ iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
+ if (!iommu)
+ return NULL;
+ iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
+ if (!iommu->reg) {
+ printk(KERN_ERR "IOMMU: can't map the region\n");
+ goto error;
+ }
+ iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
+ iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
+
+ /* the registers might be more than one page */
+ map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
+ cap_max_fault_reg_offset(iommu->cap));
+ map_size = PAGE_ALIGN_4K(map_size);
+ if (map_size > PAGE_SIZE_4K) {
+ iounmap(iommu->reg);
+ iommu->reg = ioremap(drhd->reg_base_addr, map_size);
+ if (!iommu->reg) {
+ printk(KERN_ERR "IOMMU: can't map the region\n");
+ goto error;
+ }
+ }
+
+ ver = readl(iommu->reg + DMAR_VER_REG);
+ pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
+ drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
+ iommu->cap, iommu->ecap);
+ ret = iommu_init_domains(iommu);
+ if (ret)
+ goto error_unmap;
+ spin_lock_init(&iommu->lock);
+ spin_lock_init(&iommu->register_lock);
+
+ drhd->iommu = iommu;
+ return iommu;
+error_unmap:
+ iounmap(iommu->reg);
+ iommu->reg = NULL;
+error:
+ kfree(iommu);
+ return NULL;
+}
+
+static void domain_exit(struct dmar_domain *domain);
+static void free_iommu(struct intel_iommu *iommu)
+{
+ struct dmar_domain *domain;
+ int i;
+
+ if (!iommu)
+ return;
+
+ i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
+ for (; i < cap_ndoms(iommu->cap); ) {
+ domain = iommu->domains[i];
+ clear_bit(i, iommu->domain_ids);
+ domain_exit(domain);
+ i = find_next_bit(iommu->domain_ids,
+ cap_ndoms(iommu->cap), i+1);
+ }
+
+ if (iommu->gcmd & DMA_GCMD_TE)
+ iommu_disable_translation(iommu);
+
+ if (iommu->irq) {
+ set_irq_data(iommu->irq, NULL);
+ /* This will mask the irq */
+ free_irq(iommu->irq, iommu);
+ destroy_irq(iommu->irq);
+ }
+
+ kfree(iommu->domains);
+ kfree(iommu->domain_ids);
+
+ /* free context mapping */
+ free_context_table(iommu);
+
+ if (iommu->reg)
+ iounmap(iommu->reg);
+ kfree(iommu);
+}
+
+static struct dmar_domain *iommu_alloc_domain(struct intel_iommu *iommu)
+{
+ unsigned long num;
+ unsigned long ndomains;
+ struct dmar_domain *domain;
+ unsigned long flags;
+
+ domain = alloc_domain_mem();
+ if (!domain)
+ return NULL;
+
+ ndomains = cap_ndoms(iommu->cap);
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ num = find_first_zero_bit(iommu->domain_ids, ndomains);
+ if (num >= ndomains) {
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ free_domain_mem(domain);
+ printk(KERN_ERR "IOMMU: no free domain ids\n");
+ return NULL;
+ }
+
+ set_bit(num, iommu->domain_ids);
+ domain->id = num;
+ domain->iommu = iommu;
+ iommu->domains[num] = domain;
+ spin_unlock_irqrestore(&iommu->lock, flags);
+
+ return domain;
+}
+
+static void iommu_free_domain(struct dmar_domain *domain)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&domain->iommu->lock, flags);
+ clear_bit(domain->id, domain->iommu->domain_ids);
+ spin_unlock_irqrestore(&domain->iommu->lock, flags);
+}
+
+static struct iova_domain reserved_iova_list;
+
+static void dmar_init_reserved_ranges(void)
+{
+ struct pci_dev *pdev = NULL;
+ struct iova *iova;
+ int i;
+ u64 addr, size;
+
+ init_iova_domain(&reserved_iova_list);
+
+ /* IOAPIC ranges shouldn't be accessed by DMA */
+ iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
+ IOVA_PFN(IOAPIC_RANGE_END));
+ if (!iova)
+ printk(KERN_ERR "Reserve IOAPIC range failed\n");
+
+ /* Reserve all PCI MMIO to avoid peer-to-peer access */
+ for_each_pci_dev(pdev) {
+ struct resource *r;
+
+ for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+ r = &pdev->resource[i];
+ if (!r->flags || !(r->flags & IORESOURCE_MEM))
+ continue;
+ addr = r->start;
+ addr &= PAGE_MASK_4K;
+ size = r->end - addr;
+ size = PAGE_ALIGN_4K(size);
+ iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
+ IOVA_PFN(size + addr) - 1);
+ if (!iova)
+ printk(KERN_ERR "Reserve iova failed\n");
+ }
+ }
+}
+
+static void domain_reserve_special_ranges(struct dmar_domain *domain)
+{
+ copy_reserved_iova(&reserved_iova_list, &domain->iovad);
+}
+
+static inline int guestwidth_to_adjustwidth(int gaw)
+{
+ int agaw;
+ int r = (gaw - 12) % 9;
+
+ if (r == 0)
+ agaw = gaw;
+ else
+ agaw = gaw + 9 - r;
+ if (agaw > 64)
+ agaw = 64;
+ return agaw;
+}
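
The rule here: each page-table level resolves 9 bits above the 12-bit page offset, so the adjusted width rounds (gaw - 12) up to a multiple of 9 and caps at 64. A standalone sketch of the same arithmetic, not part of the patch:

#include <assert.h>

static int adjust_width(int gaw)
{
	int r = (gaw - 12) % 9;
	int agaw = (r == 0) ? gaw : gaw + 9 - r;

	return agaw > 64 ? 64 : agaw;
}

int main(void)
{
	assert(adjust_width(48) == 48);	/* exactly 4 nine-bit levels */
	assert(adjust_width(36) == 39);	/* rounds up to 3 levels */
	return 0;
}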
+
+static int domain_init(struct dmar_domain *domain, int guest_width)
+{
+ struct intel_iommu *iommu;
+ int adjust_width, agaw;
+ unsigned long sagaw;
+
+ init_iova_domain(&domain->iovad);
+ spin_lock_init(&domain->mapping_lock);
+
+ domain_reserve_special_ranges(domain);
+
+ /* calculate AGAW */
+ iommu = domain->iommu;
+ if (guest_width > cap_mgaw(iommu->cap))
+ guest_width = cap_mgaw(iommu->cap);
+ domain->gaw = guest_width;
+ adjust_width = guestwidth_to_adjustwidth(guest_width);
+ agaw = width_to_agaw(adjust_width);
+ sagaw = cap_sagaw(iommu->cap);
+ if (!test_bit(agaw, &sagaw)) {
+ /* hardware doesn't support it, choose a bigger one */
+ pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
+ agaw = find_next_bit(&sagaw, 5, agaw);
+ if (agaw >= 5)
+ return -ENODEV;
+ }
+ domain->agaw = agaw;
+ INIT_LIST_HEAD(&domain->devices);
+
+ /* always allocate the top pgd */
+ domain->pgd = (struct dma_pte *)alloc_pgtable_page();
+ if (!domain->pgd)
+ return -ENOMEM;
+ __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
+ return 0;
+}
+
+static void domain_exit(struct dmar_domain *domain)
+{
+ u64 end;
+
+ /* Domain 0 is reserved, so don't process it */
+ if (!domain)
+ return;
+
+ domain_remove_dev_info(domain);
+ /* destroy iovas */
+ put_iova_domain(&domain->iovad);
+ end = DOMAIN_MAX_ADDR(domain->gaw);
+ end = end & (~PAGE_MASK_4K);
+
+ /* clear ptes */
+ dma_pte_clear_range(domain, 0, end);
+
+ /* free page tables */
+ dma_pte_free_pagetable(domain, 0, end);
+
+ iommu_free_domain(domain);
+ free_domain_mem(domain);
+}
+
+static int domain_context_mapping_one(struct dmar_domain *domain,
+ u8 bus, u8 devfn)
+{
+ struct context_entry *context;
+ struct intel_iommu *iommu = domain->iommu;
+ unsigned long flags;
+
+ pr_debug("Set context mapping for %02x:%02x.%d\n",
+ bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+ BUG_ON(!domain->pgd);
+ context = device_to_context_entry(iommu, bus, devfn);
+ if (!context)
+ return -ENOMEM;
+ spin_lock_irqsave(&iommu->lock, flags);
+ if (context_present(*context)) {
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ return 0;
+ }
+
+ context_set_domain_id(*context, domain->id);
+ context_set_address_width(*context, domain->agaw);
+ context_set_address_root(*context, virt_to_phys(domain->pgd));
+ context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
+ context_set_fault_enable(*context);
+ context_set_present(*context);
+ __iommu_flush_cache(iommu, context, sizeof(*context));
+
+ /* it's a non-present to present mapping */
+ if (iommu_flush_context_device(iommu, domain->id,
+ (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
+ iommu_flush_write_buffer(iommu);
+ else
+ iommu_flush_iotlb_dsi(iommu, 0, 0);
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ return 0;
+}
+
+static int
+domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
+{
+ int ret;
+ struct pci_dev *tmp, *parent;
+
+ ret = domain_context_mapping_one(domain, pdev->bus->number,
+ pdev->devfn);
+ if (ret)
+ return ret;
+
+ /* dependent device mapping */
+ tmp = pci_find_upstream_pcie_bridge(pdev);
+ if (!tmp)
+ return 0;
+ /* Secondary interface's bus number and devfn 0 */
+ parent = pdev->bus->self;
+ while (parent != tmp) {
+ ret = domain_context_mapping_one(domain, parent->bus->number,
+ parent->devfn);
+ if (ret)
+ return ret;
+ parent = parent->bus->self;
+ }
+ if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
+ return domain_context_mapping_one(domain,
+ tmp->subordinate->number, 0);
+ else /* this is a legacy PCI bridge */
+ return domain_context_mapping_one(domain,
+ tmp->bus->number, tmp->devfn);
+}
+
+static int domain_context_mapped(struct dmar_domain *domain,
+ struct pci_dev *pdev)
+{
+ int ret;
+ struct pci_dev *tmp, *parent;
+
+ ret = device_context_mapped(domain->iommu,
+ pdev->bus->number, pdev->devfn);
+ if (!ret)
+ return ret;
+ /* dependent device mapping */
+ tmp = pci_find_upstream_pcie_bridge(pdev);
+ if (!tmp)
+ return ret;
+ /* Secondary interface's bus number and devfn 0 */
+ parent = pdev->bus->self;
+ while (parent != tmp) {
+ ret = device_context_mapped(domain->iommu, parent->bus->number,
+ parent->devfn);
+ if (!ret)
+ return ret;
+ parent = parent->bus->self;
+ }
+ if (tmp->is_pcie)
+ return device_context_mapped(domain->iommu,
+ tmp->subordinate->number, 0);
+ else
+ return device_context_mapped(domain->iommu,
+ tmp->bus->number, tmp->devfn);
+}
+
+static int
+domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
+ u64 hpa, size_t size, int prot)
+{
+ u64 start_pfn, end_pfn;
+ struct dma_pte *pte;
+ int index;
+
+ if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
+ return -EINVAL;
+ iova &= PAGE_MASK_4K;
+ start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
+ end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
+ index = 0;
+ while (start_pfn < end_pfn) {
+ pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
+ if (!pte)
+ return -ENOMEM;
+ /* We don't need a lock here; nobody else
+ * touches this iova range
+ */
+ BUG_ON(dma_pte_addr(*pte));
+ dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
+ dma_set_pte_prot(*pte, prot);
+ __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
+ start_pfn++;
+ index++;
+ }
+ return 0;
+}
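
The pfn arithmetic above widens an unaligned hpa/size pair to whole 4K pages. A standalone sketch, not part of the patch, assuming the PAGE_*_4K macros expand as shown:

#include <assert.h>

#define PAGE_SHIFT_4K	12
#define PAGE_SIZE_4K	(1ULL << PAGE_SHIFT_4K)
#define PAGE_ALIGN_4K(a)	(((a) + PAGE_SIZE_4K - 1) & ~(PAGE_SIZE_4K - 1))

int main(void)
{
	unsigned long long hpa = 0x1234, size = 0x2000;
	unsigned long long start_pfn = hpa >> PAGE_SHIFT_4K;
	unsigned long long end_pfn = PAGE_ALIGN_4K(hpa + size) >> PAGE_SHIFT_4K;

	/* 0x1234..0x3233 touches pages 1, 2 and 3: three PTEs are set */
	assert(start_pfn == 1 && end_pfn == 4);
	return 0;
}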
+
+static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
+{
+ clear_context_table(domain->iommu, bus, devfn);
+ iommu_flush_context_global(domain->iommu, 0);
+ iommu_flush_iotlb_global(domain->iommu, 0);
+}
+
+static void domain_remove_dev_info(struct dmar_domain *domain)
+{
+ struct device_domain_info *info;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ while (!list_empty(&domain->devices)) {
+ info = list_entry(domain->devices.next,
+ struct device_domain_info, link);
+ list_del(&info->link);
+ list_del(&info->global);
+ if (info->dev)
+ info->dev->dev.archdata.iommu = NULL;
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ detach_domain_for_dev(info->domain, info->bus, info->devfn);
+ free_devinfo_mem(info);
+
+ spin_lock_irqsave(&device_domain_lock, flags);
+ }
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
+/*
+ * find_domain
+ * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
+ */
+struct dmar_domain *
+find_domain(struct pci_dev *pdev)
+{
+ struct device_domain_info *info;
+
+ /* No lock here, assumes no domain exit in normal case */
+ info = pdev->dev.archdata.iommu;
+ if (info)
+ return info->domain;
+ return NULL;
+}
+
+static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
+ struct pci_dev *dev)
+{
+ int index;
+
+ while (dev) {
+ for (index = 0; index < cnt; index++)
+ if (dev == devices[index])
+ return 1;
+
+ /* Check our parent */
+ dev = dev->bus->self;
+ }
+
+ return 0;
+}
+
+static struct dmar_drhd_unit *
+dmar_find_matched_drhd_unit(struct pci_dev *dev)
+{
+ struct dmar_drhd_unit *drhd = NULL;
+
+ list_for_each_entry(drhd, &dmar_drhd_units, list) {
+ if (drhd->include_all || dmar_pci_device_match(drhd->devices,
+ drhd->devices_cnt, dev))
+ return drhd;
+ }
+
+ return NULL;
+}
+
+/* domain is initialized */
+static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
+{
+ struct dmar_domain *domain, *found = NULL;
+ struct intel_iommu *iommu;
+ struct dmar_drhd_unit *drhd;
+ struct device_domain_info *info, *tmp;
+ struct pci_dev *dev_tmp;
+ unsigned long flags;
+ int bus = 0, devfn = 0;
+
+ domain = find_domain(pdev);
+ if (domain)
+ return domain;
+
+ dev_tmp = pci_find_upstream_pcie_bridge(pdev);
+ if (dev_tmp) {
+ if (dev_tmp->is_pcie) {
+ bus = dev_tmp->subordinate->number;
+ devfn = 0;
+ } else {
+ bus = dev_tmp->bus->number;
+ devfn = dev_tmp->devfn;
+ }
+ spin_lock_irqsave(&device_domain_lock, flags);
+ list_for_each_entry(info, &device_domain_list, global) {
+ if (info->bus == bus && info->devfn == devfn) {
+ found = info->domain;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+ /* the pcie-pci bridge already has a domain, use it */
+ if (found) {
+ domain = found;
+ goto found_domain;
+ }
+ }
+
+ /* Allocate new domain for the device */
+ drhd = dmar_find_matched_drhd_unit(pdev);
+ if (!drhd) {
+ printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
+ pci_name(pdev));
+ return NULL;
+ }
+ iommu = drhd->iommu;
+
+ domain = iommu_alloc_domain(iommu);
+ if (!domain)
+ goto error;
+
+ if (domain_init(domain, gaw)) {
+ domain_exit(domain);
+ goto error;
+ }
+
+ /* register pcie-to-pci device */
+ if (dev_tmp) {
+ info = alloc_devinfo_mem();
+ if (!info) {
+ domain_exit(domain);
+ goto error;
+ }
+ info->bus = bus;
+ info->devfn = devfn;
+ info->dev = NULL;
+ info->domain = domain;
+ /* This domain is shared by devices under p2p bridge */
+ domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
+
+ /* the pcie-to-pci bridge already has a domain, use it */
+ found = NULL;
+ spin_lock_irqsave(&device_domain_lock, flags);
+ list_for_each_entry(tmp, &device_domain_list, global) {
+ if (tmp->bus == bus && tmp->devfn == devfn) {
+ found = tmp->domain;
+ break;
+ }
+ }
+ if (found) {
+ free_devinfo_mem(info);
+ domain_exit(domain);
+ domain = found;
+ } else {
+ list_add(&info->link, &domain->devices);
+ list_add(&info->global, &device_domain_list);
+ }
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+ }
+
+found_domain:
+ info = alloc_devinfo_mem();
+ if (!info)
+ goto error;
+ info->bus = pdev->bus->number;
+ info->devfn = pdev->devfn;
+ info->dev = pdev;
+ info->domain = domain;
+ spin_lock_irqsave(&device_domain_lock, flags);
+ /* somebody else raced us and set it up first */
+ found = find_domain(pdev);
+ if (found != NULL) {
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+ if (found != domain) {
+ domain_exit(domain);
+ domain = found;
+ }
+ free_devinfo_mem(info);
+ return domain;
+ }
+ list_add(&info->link, &domain->devices);
+ list_add(&info->global, &device_domain_list);
+ pdev->dev.archdata.iommu = info;
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+ return domain;
+error:
+ /* recheck it here, maybe others set it */
+ return find_domain(pdev);
+}
+
+static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
+{
+ struct dmar_domain *domain;
+ unsigned long size;
+ u64 base;
+ int ret;
+
+ printk(KERN_INFO
+ "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
+ pci_name(pdev), start, end);
+ /* page table init */
+ domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
+ if (!domain)
+ return -ENOMEM;
+
+ /* The address might not be aligned */
+ base = start & PAGE_MASK_4K;
+ size = end - base;
+ size = PAGE_ALIGN_4K(size);
+ if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
+ IOVA_PFN(base + size) - 1)) {
+ printk(KERN_ERR "IOMMU: reserve iova failed\n");
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ pr_debug("Mapping reserved region %lx@%llx for %s\n",
+ size, base, pci_name(pdev));
+ /*
+ * the RMRR range might overlap the physical memory range,
+ * clear it first
+ */
+ dma_pte_clear_range(domain, base, base + size);
+
+ ret = domain_page_mapping(domain, base, base, size,
+ DMA_PTE_READ|DMA_PTE_WRITE);
+ if (ret)
+ goto error;
+
+ /* context entry init */
+ ret = domain_context_mapping(domain, pdev);
+ if (!ret)
+ return 0;
+error:
+ domain_exit(domain);
+ return ret;
+}
+
+static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
+ struct pci_dev *pdev)
+{
+ if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
+ return 0;
+ return iommu_prepare_identity_map(pdev, rmrr->base_address,
+ rmrr->end_address + 1);
+}
+
+#ifdef CONFIG_DMAR_GFX_WA
+extern int arch_get_ram_range(int slot, u64 *addr, u64 *size);
+static void __init iommu_prepare_gfx_mapping(void)
+{
+ struct pci_dev *pdev = NULL;
+ u64 base, size;
+ int slot;
+ int ret;
+
+ for_each_pci_dev(pdev) {
+ if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
+ !IS_GFX_DEVICE(pdev))
+ continue;
+ printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
+ pci_name(pdev));
+ slot = arch_get_ram_range(0, &base, &size);
+ while (slot >= 0) {
+ ret = iommu_prepare_identity_map(pdev,
+ base, base + size);
+ if (ret)
+ goto error;
+ slot = arch_get_ram_range(slot, &base, &size);
+ }
+ continue;
+error:
+ printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
+ }
+}
+#endif
+
+#ifdef CONFIG_DMAR_FLOPPY_WA
+static inline void iommu_prepare_isa(void)
+{
+ struct pci_dev *pdev;
+ int ret;
+
+ pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
+ if (!pdev)
+ return;
+
+ printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
+ ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
+
+ if (ret)
+ printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
+ "floppy might not work\n");
+}
+#else
+static inline void iommu_prepare_isa(void)
+{
+ return;
+}
+#endif /* !CONFIG_DMAR_FLOPPY_WA */
+
+int __init init_dmars(void)
+{
+ struct dmar_drhd_unit *drhd;
+ struct dmar_rmrr_unit *rmrr;
+ struct pci_dev *pdev;
+ struct intel_iommu *iommu;
+ int ret, unit = 0;
+
+ /*
+ * for each drhd
+ * allocate root
+ * initialize and program root entry to not present
+ * endfor
+ */
+ for_each_drhd_unit(drhd) {
+ if (drhd->ignored)
+ continue;
+ iommu = alloc_iommu(drhd);
+ if (!iommu) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ /*
+ * TBD:
+ * we could share the same root & context tables
+ * among all IOMMUs. Need to split it later.
+ */
+ ret = iommu_alloc_root_entry(iommu);
+ if (ret) {
+ printk(KERN_ERR "IOMMU: allocate root entry failed\n");
+ goto error;
+ }
+ }
+
+ /*
+ * For each rmrr
+ * for each dev attached to rmrr
+ * do
+ * locate drhd for dev, alloc domain for dev
+ * allocate free domain
+ * allocate page table entries for rmrr
+ * if context not allocated for bus
+ * allocate and init context
+ * set present in root table for this bus
+ * init context with domain, translation etc
+ * endfor
+ * endfor
+ */
+ for_each_rmrr_units(rmrr) {
+ int i;
+ for (i = 0; i < rmrr->devices_cnt; i++) {
+ pdev = rmrr->devices[i];
+ /* some BIOSes list nonexistent devices in the DMAR table */
+ if (!pdev)
+ continue;
+ ret = iommu_prepare_rmrr_dev(rmrr, pdev);
+ if (ret)
+ printk(KERN_ERR
+ "IOMMU: mapping reserved region failed\n");
+ }
+ }
+
+ iommu_prepare_gfx_mapping();
+
+ iommu_prepare_isa();
+
+ /*
+ * for each drhd
+ * enable fault log
+ * global invalidate context cache
+ * global invalidate iotlb
+ * enable translation
+ */
+ for_each_drhd_unit(drhd) {
+ if (drhd->ignored)
+ continue;
+ iommu = drhd->iommu;
+ sprintf(iommu->name, "dmar%d", unit++);
+
+ iommu_flush_write_buffer(iommu);
+
+ ret = dmar_set_interrupt(iommu);
+ if (ret)
+ goto error;
+
+ iommu_set_root_entry(iommu);
+
+ iommu_flush_context_global(iommu, 0);
+ iommu_flush_iotlb_global(iommu, 0);
+
+ ret = iommu_enable_translation(iommu);
+ if (ret)
+ goto error;
+ }
+
+ return 0;
+error:
+ for_each_drhd_unit(drhd) {
+ if (drhd->ignored)
+ continue;
+ iommu = drhd->iommu;
+ free_iommu(iommu);
+ }
+ return ret;
+}
+
+static inline u64 aligned_size(u64 host_addr, size_t size)
+{
+ u64 addr;
+ addr = (host_addr & (~PAGE_MASK_4K)) + size;
+ return PAGE_ALIGN_4K(addr);
+}
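
aligned_size() adds the intra-page offset before rounding, so a small buffer that straddles a page boundary costs two pages of IOVA space. A standalone sketch, not part of the patch, with the 4K constants assumed:

#include <assert.h>

#define PAGE_SIZE_4K	4096ULL
#define PAGE_MASK_4K	(~(PAGE_SIZE_4K - 1))
#define PAGE_ALIGN_4K(a)	(((a) + PAGE_SIZE_4K - 1) & PAGE_MASK_4K)

static unsigned long long aligned_size(unsigned long long host_addr,
				       unsigned long long size)
{
	return PAGE_ALIGN_4K((host_addr & ~PAGE_MASK_4K) + size);
}

int main(void)
{
	assert(aligned_size(0x1f80, 0x100) == 2 * PAGE_SIZE_4K); /* straddles */
	assert(aligned_size(0x2000, 0x100) == PAGE_SIZE_4K);     /* fits */
	return 0;
}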
+
+struct iova *
+iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
+{
+ struct iova *piova;
+
+ /* Make sure it's in range */
+ end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
+ if (!size || (IOVA_START_ADDR + size > end))
+ return NULL;
+
+ piova = alloc_iova(&domain->iovad,
+ size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
+ return piova;
+}
+
+static struct iova *
+__intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
+ size_t size)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct iova *iova = NULL;
+
+ if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
+ iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
+ } else {
+ /*
+ * First try to allocate an io virtual address in
+ * DMA_32BIT_MASK and if that fails then try allocating
+ * from the higher range
+ */
+ iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
+ if (!iova)
+ iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
+ }
+
+ if (!iova) {
+ printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
+ return NULL;
+ }
+
+ return iova;
+}
+
+static struct dmar_domain *
+get_valid_domain_for_dev(struct pci_dev *pdev)
+{
+ struct dmar_domain *domain;
+ int ret;
+
+ domain = get_domain_for_dev(pdev,
+ DEFAULT_DOMAIN_ADDRESS_WIDTH);
+ if (!domain) {
+ printk(KERN_ERR
+ "Allocating domain for %s failed", pci_name(pdev));
+ return 0;
+ }
+
+ /* make sure context mapping is ok */
+ if (unlikely(!domain_context_mapped(domain, pdev))) {
+ ret = domain_context_mapping(domain, pdev);
+ if (ret) {
+ printk(KERN_ERR
+ "Domain context map for %s failed",
+ pci_name(pdev));
+ return 0;
+ }
+ }
+
+ return domain;
+}
+
+static dma_addr_t intel_map_single(struct device *hwdev, void *addr,
+ size_t size, int dir)
+{
+ struct pci_dev *pdev = to_pci_dev(hwdev);
+ int ret;
+ struct dmar_domain *domain;
+ unsigned long start_addr;
+ struct iova *iova;
+ int prot = 0;
+
+ BUG_ON(dir == DMA_NONE);
+ if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
+ return virt_to_bus(addr);
+
+ domain = get_valid_domain_for_dev(pdev);
+ if (!domain)
+ return 0;
+
+ addr = (void *)virt_to_phys(addr);
+ size = aligned_size((u64)addr, size);
+
+ iova = __intel_alloc_iova(hwdev, domain, size);
+ if (!iova)
+ goto error;
+
+ start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
+
+ /*
+ * Check if DMAR supports zero-length reads on write-only
+ * mappings.
+ */
+ if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
+ !cap_zlr(domain->iommu->cap))
+ prot |= DMA_PTE_READ;
+ if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+ prot |= DMA_PTE_WRITE;
+ /*
+ * addr..(addr + size) might cover a partial page, so we map the whole
+ * page. Note: if two parts of one page are mapped separately, we
+ * might have two guest addresses mapping to the same host addr, but
+ * this is not a big problem
+ */
+ ret = domain_page_mapping(domain, start_addr,
+ ((u64)addr) & PAGE_MASK_4K, size, prot);
+ if (ret)
+ goto error;
+
+ pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
+ pci_name(pdev), size, (u64)addr,
+ size, (u64)start_addr, dir);
+
+ /* it's a non-present to present mapping */
+ ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
+ start_addr, size >> PAGE_SHIFT_4K, 1);
+ if (ret)
+ iommu_flush_write_buffer(domain->iommu);
+
+ return (start_addr + ((u64)addr & (~PAGE_MASK_4K)));
+
+error:
+ if (iova)
+ __free_iova(&domain->iovad, iova);
+ printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
+ pci_name(pdev), size, (u64)addr, dir);
+ return 0;
+}
+
+static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
+ size_t size, int dir)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct dmar_domain *domain;
+ unsigned long start_addr;
+ struct iova *iova;
+
+ if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
+ return;
+ domain = find_domain(pdev);
+ BUG_ON(!domain);
+
+ iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
+ if (!iova)
+ return;
+
+ start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
+ size = aligned_size((u64)dev_addr, size);
+
+ pr_debug("Device %s unmapping: %lx@%llx\n",
+ pci_name(pdev), size, (u64)start_addr);
+
+ /* clear the whole range */
+ dma_pte_clear_range(domain, start_addr, start_addr + size);
+ /* free page tables */
+ dma_pte_free_pagetable(domain, start_addr, start_addr + size);
+
+ if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
+ size >> PAGE_SHIFT_4K, 0))
+ iommu_flush_write_buffer(domain->iommu);
+
+ /* free iova */
+ __free_iova(&domain->iovad, iova);
+}
+
+static void *intel_alloc_coherent(struct device *hwdev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flags)
+{
+ void *vaddr;
+ int order;
+
+ size = PAGE_ALIGN_4K(size);
+ order = get_order(size);
+ flags &= ~(GFP_DMA | GFP_DMA32);
+
+ vaddr = (void *)__get_free_pages(flags, order);
+ if (!vaddr)
+ return NULL;
+ memset(vaddr, 0, size);
+
+ *dma_handle = intel_map_single(hwdev, vaddr, size, DMA_BIDIRECTIONAL);
+ if (*dma_handle)
+ return vaddr;
+ free_pages((unsigned long)vaddr, order);
+ return NULL;
+}
+
+static void intel_free_coherent(struct device *hwdev, size_t size,
+ void *vaddr, dma_addr_t dma_handle)
+{
+ int order;
+
+ size = PAGE_ALIGN_4K(size);
+ order = get_order(size);
+
+ intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
+ free_pages((unsigned long)vaddr, order);
+}
+
+#define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
+static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
+ int nelems, int dir)
+{
+ int i;
+ struct pci_dev *pdev = to_pci_dev(hwdev);
+ struct dmar_domain *domain;
+ unsigned long start_addr;
+ struct iova *iova;
+ size_t size = 0;
+ void *addr;
+ struct scatterlist *sg;
+
+ if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
+ return;
+
+ domain = find_domain(pdev);
+
+ iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
+ if (!iova)
+ return;
+ for_each_sg(sglist, sg, nelems, i) {
+ addr = SG_ENT_VIRT_ADDRESS(sg);
+ size += aligned_size((u64)addr, sg->length);
+ }
+
+ start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
+
+ /* clear the whole range */
+ dma_pte_clear_range(domain, start_addr, start_addr + size);
+ /* free page tables */
+ dma_pte_free_pagetable(domain, start_addr, start_addr + size);
+
+ if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
+ size >> PAGE_SHIFT_4K, 0))
+ iommu_flush_write_buffer(domain->iommu);
+
+ /* free iova */
+ __free_iova(&domain->iovad, iova);
+}
+
+static int intel_nontranslate_map_sg(struct device *hwdev,
+ struct scatterlist *sglist, int nelems, int dir)
+{
+ int i;
+ struct scatterlist *sg;
+
+ for_each_sg(sglist, sg, nelems, i) {
+ BUG_ON(!sg_page(sg));
+ sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
+ sg->dma_length = sg->length;
+ }
+ return nelems;
+}
+
+static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
+ int nelems, int dir)
+{
+ void *addr;
+ int i;
+ struct pci_dev *pdev = to_pci_dev(hwdev);
+ struct dmar_domain *domain;
+ size_t size = 0;
+ int prot = 0;
+ size_t offset = 0;
+ struct iova *iova = NULL;
+ int ret;
+ struct scatterlist *sg;
+ unsigned long start_addr;
+
+ BUG_ON(dir == DMA_NONE);
+ if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
+ return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
+
+ domain = get_valid_domain_for_dev(pdev);
+ if (!domain)
+ return 0;
+
+ for_each_sg(sglist, sg, nelems, i) {
+ addr = SG_ENT_VIRT_ADDRESS(sg);
+ addr = (void *)virt_to_phys(addr);
+ size += aligned_size((u64)addr, sg->length);
+ }
+
+ iova = __intel_alloc_iova(hwdev, domain, size);
+ if (!iova) {
+ sglist->dma_length = 0;
+ return 0;
+ }
+
+ /*
+ * Check if DMAR supports zero-length reads on write-only
+ * mappings.
+ */
+ if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
+ !cap_zlr(domain->iommu->cap))
+ prot |= DMA_PTE_READ;
+ if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+ prot |= DMA_PTE_WRITE;
+
+ start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
+ offset = 0;
+ for_each_sg(sglist, sg, nelems, i) {
+ addr = SG_ENT_VIRT_ADDRESS(sg);
+ addr = (void *)virt_to_phys(addr);
+ size = aligned_size((u64)addr, sg->length);
+ ret = domain_page_mapping(domain, start_addr + offset,
+ ((u64)addr) & PAGE_MASK_4K,
+ size, prot);
+ if (ret) {
+ /* clear the pages mapped so far */
+ dma_pte_clear_range(domain, start_addr,
+ start_addr + offset);
+ /* free page tables */
+ dma_pte_free_pagetable(domain, start_addr,
+ start_addr + offset);
+ /* free iova */
+ __free_iova(&domain->iovad, iova);
+ return 0;
+ }
+ sg->dma_address = start_addr + offset +
+ ((u64)addr & (~PAGE_MASK_4K));
+ sg->dma_length = sg->length;
+ offset += size;
+ }
+
+ /* it's a non-present to present mapping */
+ if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
+ start_addr, offset >> PAGE_SHIFT_4K, 1))
+ iommu_flush_write_buffer(domain->iommu);
+ return nelems;
+}
+
+static struct dma_mapping_ops intel_dma_ops = {
+ .alloc_coherent = intel_alloc_coherent,
+ .free_coherent = intel_free_coherent,
+ .map_single = intel_map_single,
+ .unmap_single = intel_unmap_single,
+ .map_sg = intel_map_sg,
+ .unmap_sg = intel_unmap_sg,
+};
+
+static inline int iommu_domain_cache_init(void)
+{
+ int ret = 0;
+
+ iommu_domain_cache = kmem_cache_create("iommu_domain",
+ sizeof(struct dmar_domain),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!iommu_domain_cache) {
+ printk(KERN_ERR "Couldn't create iommu_domain cache\n");
+ ret = -ENOMEM;
+ }
+
+ return ret;
+}
+
+static inline int iommu_devinfo_cache_init(void)
+{
+ int ret = 0;
+
+ iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
+ sizeof(struct device_domain_info),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!iommu_devinfo_cache) {
+ printk(KERN_ERR "Couldn't create devinfo cache\n");
+ ret = -ENOMEM;
+ }
+
+ return ret;
+}
+
+static inline int iommu_iova_cache_init(void)
+{
+ int ret = 0;
+
+ iommu_iova_cache = kmem_cache_create("iommu_iova",
+ sizeof(struct iova),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!iommu_iova_cache) {
+ printk(KERN_ERR "Couldn't create iova cache\n");
+ ret = -ENOMEM;
+ }
+
+ return ret;
+}
+
+static int __init iommu_init_mempool(void)
+{
+ int ret;
+ ret = iommu_iova_cache_init();
+ if (ret)
+ return ret;
+
+ ret = iommu_domain_cache_init();
+ if (ret)
+ goto domain_error;
+
+ ret = iommu_devinfo_cache_init();
+ if (!ret)
+ return ret;
+
+ kmem_cache_destroy(iommu_domain_cache);
+domain_error:
+ kmem_cache_destroy(iommu_iova_cache);
+
+ return -ENOMEM;
+}
+
+static void __init iommu_exit_mempool(void)
+{
+ kmem_cache_destroy(iommu_devinfo_cache);
+ kmem_cache_destroy(iommu_domain_cache);
+ kmem_cache_destroy(iommu_iova_cache);
+}
+
+void __init detect_intel_iommu(void)
+{
+ if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
+ return;
+ if (early_dmar_detect())
+ iommu_detected = 1;
+}
+
+static void __init init_no_remapping_devices(void)
+{
+ struct dmar_drhd_unit *drhd;
+
+ for_each_drhd_unit(drhd) {
+ if (!drhd->include_all) {
+ int i;
+ for (i = 0; i < drhd->devices_cnt; i++)
+ if (drhd->devices[i] != NULL)
+ break;
+ /* ignore DMAR unit if no pci devices exist */
+ if (i == drhd->devices_cnt)
+ drhd->ignored = 1;
+ }
+ }
+
+ if (dmar_map_gfx)
+ return;
+
+ for_each_drhd_unit(drhd) {
+ int i;
+ if (drhd->ignored || drhd->include_all)
+ continue;
+
+ for (i = 0; i < drhd->devices_cnt; i++)
+ if (drhd->devices[i] &&
+ !IS_GFX_DEVICE(drhd->devices[i]))
+ break;
+
+ if (i < drhd->devices_cnt)
+ continue;
+
+ /* bypass IOMMU if it is just for gfx devices */
+ drhd->ignored = 1;
+ for (i = 0; i < drhd->devices_cnt; i++) {
+ if (!drhd->devices[i])
+ continue;
+ drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
+ }
+ }
+}
+
+int __init intel_iommu_init(void)
+{
+ int ret = 0;
+
+ if (no_iommu || swiotlb || dmar_disabled)
+ return -ENODEV;
+
+ if (dmar_table_init())
+ return -ENODEV;
+
+ iommu_init_mempool();
+ dmar_init_reserved_ranges();
+
+ init_no_remapping_devices();
+
+ ret = init_dmars();
+ if (ret) {
+ printk(KERN_ERR "IOMMU: dmar init failed\n");
+ put_iova_domain(&reserved_iova_list);
+ iommu_exit_mempool();
+ return ret;
+ }
+ printk(KERN_INFO
+ "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
+
+ force_iommu = 1;
+ dma_ops = &intel_dma_ops;
+ return 0;
+}
+
diff --git a/drivers/pci/intel-iommu.h b/drivers/pci/intel-iommu.h
new file mode 100644
index 00000000000..ee88dd2400c
--- /dev/null
+++ b/drivers/pci/intel-iommu.h
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Ashok Raj <ashok.raj@intel.com>
+ * Copyright (C) Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ */
+
+#ifndef _INTEL_IOMMU_H_
+#define _INTEL_IOMMU_H_
+
+#include <linux/types.h>
+#include <linux/msi.h>
+#include "iova.h"
+#include <linux/io.h>
+
+/*
+ * Intel IOMMU register specification per version 1.0 public spec.
+ */
+
+#define DMAR_VER_REG 0x0 /* Arch version supported by this IOMMU */
+#define DMAR_CAP_REG 0x8 /* Hardware supported capabilities */
+#define DMAR_ECAP_REG 0x10 /* Extended capabilities supported */
+#define DMAR_GCMD_REG 0x18 /* Global command register */
+#define DMAR_GSTS_REG 0x1c /* Global status register */
+#define DMAR_RTADDR_REG 0x20 /* Root entry table */
+#define DMAR_CCMD_REG 0x28 /* Context command reg */
+#define DMAR_FSTS_REG 0x34 /* Fault Status register */
+#define DMAR_FECTL_REG 0x38 /* Fault control register */
+#define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data register */
+#define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr register */
+#define DMAR_FEUADDR_REG 0x44 /* Upper address register */
+#define DMAR_AFLOG_REG 0x58 /* Advanced Fault control */
+#define DMAR_PMEN_REG 0x64 /* Enable Protected Memory Region */
+#define DMAR_PLMBASE_REG 0x68 /* PMRR Low addr */
+#define DMAR_PLMLIMIT_REG 0x6c /* PMRR low limit */
+#define DMAR_PHMBASE_REG 0x70 /* pmrr high base addr */
+#define DMAR_PHMLIMIT_REG 0x78 /* pmrr high limit */
+
+#define OFFSET_STRIDE (9)
+/*
+#define dmar_readl(dmar, reg) readl(dmar + reg)
+#define dmar_readq(dmar, reg) ({ \
+ u32 lo, hi; \
+ lo = readl(dmar + reg); \
+ hi = readl(dmar + reg + 4); \
+ (((u64) hi) << 32) + lo; })
+*/
+static inline u64 dmar_readq(void __iomem *addr)
+{
+ u32 lo, hi;
+ lo = readl(addr);
+ hi = readl(addr + 4);
+ return (((u64) hi) << 32) + lo;
+}
+
+static inline void dmar_writeq(void __iomem *addr, u64 val)
+{
+ writel((u32)val, addr);
+ writel((u32)(val >> 32), addr + 4);
+}
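
These helpers split each 64-bit register access into two 32-bit ones, low word first. A standalone sketch, not part of the patch, with plain memory standing in for the MMIO window and made-up sim_* names:

#include <assert.h>
#include <stdint.h>
#include <string.h>

static void sim_writeq(uint8_t *addr, uint64_t val)
{
	uint32_t lo = (uint32_t)val, hi = (uint32_t)(val >> 32);

	memcpy(addr, &lo, 4);		/* writel((u32)val, addr) */
	memcpy(addr + 4, &hi, 4);	/* writel(val >> 32, addr + 4) */
}

static uint64_t sim_readq(const uint8_t *addr)
{
	uint32_t lo, hi;

	memcpy(&lo, addr, 4);
	memcpy(&hi, addr + 4, 4);
	return ((uint64_t)hi << 32) + lo;
}

int main(void)
{
	uint8_t reg[8];

	sim_writeq(reg, 0x1122334455667788ULL);
	assert(sim_readq(reg) == 0x1122334455667788ULL);
	return 0;
}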
+
+#define DMAR_VER_MAJOR(v) (((v) & 0xf0) >> 4)
+#define DMAR_VER_MINOR(v) ((v) & 0x0f)
+
+/*
+ * Decoding Capability Register
+ */
+#define cap_read_drain(c) (((c) >> 55) & 1)
+#define cap_write_drain(c) (((c) >> 54) & 1)
+#define cap_max_amask_val(c) (((c) >> 48) & 0x3f)
+#define cap_num_fault_regs(c) ((((c) >> 40) & 0xff) + 1)
+#define cap_pgsel_inv(c) (((c) >> 39) & 1)
+
+#define cap_super_page_val(c) (((c) >> 34) & 0xf)
+#define cap_super_offset(c) (((find_first_bit(&cap_super_page_val(c), 4)) \
+ * OFFSET_STRIDE) + 21)
+
+#define cap_fault_reg_offset(c) ((((c) >> 24) & 0x3ff) * 16)
+#define cap_max_fault_reg_offset(c) \
+ (cap_fault_reg_offset(c) + cap_num_fault_regs(c) * 16)
+
+#define cap_zlr(c) (((c) >> 22) & 1)
+#define cap_isoch(c) (((c) >> 23) & 1)
+#define cap_mgaw(c) ((((c) >> 16) & 0x3f) + 1)
+#define cap_sagaw(c) (((c) >> 8) & 0x1f)
+#define cap_caching_mode(c) (((c) >> 7) & 1)
+#define cap_phmr(c) (((c) >> 6) & 1)
+#define cap_plmr(c) (((c) >> 5) & 1)
+#define cap_rwbf(c) (((c) >> 4) & 1)
+#define cap_afl(c) (((c) >> 3) & 1)
+#define cap_ndoms(c) (((unsigned long)1) << (4 + 2 * ((c) & 0x7)))
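
A worked decode of two of these fields; the capability value below is made up for illustration, not read from real hardware:

#include <assert.h>
typedef unsigned long long u64;

#define cap_mgaw(c)	((((c) >> 16) & 0x3f) + 1)
#define cap_ndoms(c)	(((unsigned long)1) << (4 + 2 * ((c) & 0x7)))

int main(void)
{
	/* ND field = 2 -> 2^(4+4) = 256 domain ids; MGAW field = 47 ->
	 * 48-bit maximum guest address width */
	u64 cap = (47ULL << 16) | 2;

	assert(cap_ndoms(cap) == 256);
	assert(cap_mgaw(cap) == 48);
	return 0;
}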
+/*
+ * Extended Capability Register
+ */
+
+#define ecap_niotlb_iunits(e) ((((e) >> 24) & 0xff) + 1)
+#define ecap_iotlb_offset(e) ((((e) >> 8) & 0x3ff) * 16)
+#define ecap_max_iotlb_offset(e) \
+ (ecap_iotlb_offset(e) + ecap_niotlb_iunits(e) * 16)
+#define ecap_coherent(e) ((e) & 0x1)
+
+
+/* IOTLB_REG */
+#define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60)
+#define DMA_TLB_DSI_FLUSH (((u64)2) << 60)
+#define DMA_TLB_PSI_FLUSH (((u64)3) << 60)
+#define DMA_TLB_IIRG(type) (((type) >> 60) & 7)
+#define DMA_TLB_IAIG(val) (((val) >> 57) & 7)
+#define DMA_TLB_READ_DRAIN (((u64)1) << 49)
+#define DMA_TLB_WRITE_DRAIN (((u64)1) << 48)
+#define DMA_TLB_DID(id) (((u64)((id) & 0xffff)) << 32)
+#define DMA_TLB_IVT (((u64)1) << 63)
+#define DMA_TLB_IH_NONLEAF (((u64)1) << 6)
+#define DMA_TLB_MAX_SIZE (0x3f)
+
+/* GCMD_REG */
+#define DMA_GCMD_TE (((u32)1) << 31)
+#define DMA_GCMD_SRTP (((u32)1) << 30)
+#define DMA_GCMD_SFL (((u32)1) << 29)
+#define DMA_GCMD_EAFL (((u32)1) << 28)
+#define DMA_GCMD_WBF (((u32)1) << 27)
+
+/* GSTS_REG */
+#define DMA_GSTS_TES (((u32)1) << 31)
+#define DMA_GSTS_RTPS (((u32)1) << 30)
+#define DMA_GSTS_FLS (((u32)1) << 29)
+#define DMA_GSTS_AFLS (((u32)1) << 28)
+#define DMA_GSTS_WBFS (((u32)1) << 27)
+
+/* CCMD_REG */
+#define DMA_CCMD_ICC (((u64)1) << 63)
+#define DMA_CCMD_GLOBAL_INVL (((u64)1) << 61)
+#define DMA_CCMD_DOMAIN_INVL (((u64)2) << 61)
+#define DMA_CCMD_DEVICE_INVL (((u64)3) << 61)
+#define DMA_CCMD_FM(m) (((u64)((m) & 0x3)) << 32)
+#define DMA_CCMD_MASK_NOBIT 0
+#define DMA_CCMD_MASK_1BIT 1
+#define DMA_CCMD_MASK_2BIT 2
+#define DMA_CCMD_MASK_3BIT 3
+#define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16)
+#define DMA_CCMD_DID(d) ((u64)((d) & 0xffff))
+
+/* FECTL_REG */
+#define DMA_FECTL_IM (((u32)1) << 31)
+
+/* FSTS_REG */
+#define DMA_FSTS_PPF ((u32)2)
+#define DMA_FSTS_PFO ((u32)1)
+#define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff)
+
+/* FRCD_REG, 32 bits access */
+#define DMA_FRCD_F (((u32)1) << 31)
+#define dma_frcd_type(d) (((d) >> 30) & 1)
+#define dma_frcd_fault_reason(c) ((c) & 0xff)
+#define dma_frcd_source_id(c) ((c) & 0xffff)
+#define dma_frcd_page_addr(d) ((d) & (((u64)-1) << 12)) /* low 64 bits */
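
A worked decode of a made-up FRCD upper dword (the word iommu_page_fault() reads at offset 12) using these macros:

#include <assert.h>
typedef unsigned int u32;

#define DMA_FRCD_F			(((u32)1) << 31)
#define dma_frcd_type(d)		(((d) >> 30) & 1)
#define dma_frcd_fault_reason(c)	((c) & 0xff)

int main(void)
{
	/* fault present, DMA read, reason 6 ("PTE Read access is not set") */
	u32 frcd_hi = DMA_FRCD_F | (1u << 30) | 6;

	assert(frcd_hi & DMA_FRCD_F);
	assert(dma_frcd_type(frcd_hi) == 1);
	assert(dma_frcd_fault_reason(frcd_hi) == 6);
	return 0;
}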
+
+/*
+ * 0: Present
+ * 1-11: Reserved
+ * 12-63: Context Ptr (12 - (haw-1))
+ * 64-127: Reserved
+ */
+struct root_entry {
+ u64 val;
+ u64 rsvd1;
+};
+#define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry))
+static inline bool root_present(struct root_entry *root)
+{
+ return (root->val & 1);
+}
+static inline void set_root_present(struct root_entry *root)
+{
+ root->val |= 1;
+}
+static inline void set_root_value(struct root_entry *root, unsigned long value)
+{
+ root->val |= value & PAGE_MASK_4K;
+}
+
+struct context_entry;
+static inline struct context_entry *
+get_context_addr_from_root(struct root_entry *root)
+{
+ return (struct context_entry *)
+ (root_present(root) ?
+ phys_to_virt(root->val & PAGE_MASK_4K) : NULL);
+}
+
+/*
+ * low 64 bits:
+ * 0: present
+ * 1: fault processing disable
+ * 2-3: translation type
+ * 12-63: address space root
+ * high 64 bits:
+ * 0-2: address width
+ * 3-6: available
+ * 8-23: domain id
+ */
+struct context_entry {
+ u64 lo;
+ u64 hi;
+};
+#define context_present(c) ((c).lo & 1)
+#define context_fault_disable(c) (((c).lo >> 1) & 1)
+#define context_translation_type(c) (((c).lo >> 2) & 3)
+#define context_address_root(c) ((c).lo & PAGE_MASK_4K)
+#define context_address_width(c) ((c).hi & 7)
+#define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1))
+
+#define context_set_present(c) do {(c).lo |= 1;} while (0)
+#define context_set_fault_enable(c) \
+ do {(c).lo &= (((u64)-1) << 2) | 1;} while (0)
+#define context_set_translation_type(c, val) \
+ do { \
+ (c).lo &= (((u64)-1) << 4) | 3; \
+ (c).lo |= ((val) & 3) << 2; \
+ } while (0)
+#define CONTEXT_TT_MULTI_LEVEL 0
+#define context_set_address_root(c, val) \
+ do {(c).lo |= (val) & PAGE_MASK_4K;} while (0)
+#define context_set_address_width(c, val) do {(c).hi |= (val) & 7;} while (0)
+#define context_set_domain_id(c, val) \
+ do {(c).hi |= ((val) & ((1 << 16) - 1)) << 8;} while (0)
+#define context_clear_entry(c) do {(c).lo = 0; (c).hi = 0;} while (0)
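
A standalone sketch, not part of the patch, of composing a context entry with the accessors above, mirroring what domain_context_mapping_one() does; the page-table root and the AW value are made-up illustrations:

#include <assert.h>
typedef unsigned long long u64;

#define PAGE_MASK_4K	(~((u64)0xfff))
struct context_entry { u64 lo; u64 hi; };
#define context_present(c)	((c).lo & 1)
#define context_domain_id(c)	(((c).hi >> 8) & ((1 << 16) - 1))
#define context_set_present(c)	do {(c).lo |= 1;} while (0)
#define context_set_address_root(c, val) \
	do {(c).lo |= (val) & PAGE_MASK_4K;} while (0)
#define context_set_address_width(c, val) do {(c).hi |= (val) & 7;} while (0)
#define context_set_domain_id(c, val) \
	do {(c).hi |= ((val) & ((1 << 16) - 1)) << 8;} while (0)

int main(void)
{
	struct context_entry ce = { 0, 0 };

	context_set_domain_id(ce, 5);
	context_set_address_width(ce, 2);	/* an assumed AGAW encoding */
	context_set_address_root(ce, 0x12345000ULL);
	context_set_present(ce);

	assert(context_present(ce));
	assert(context_domain_id(ce) == 5);
	assert((ce.lo & PAGE_MASK_4K) == 0x12345000ULL);
	return 0;
}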
+
+/*
+ * 0: readable
+ * 1: writable
+ * 2-6: reserved
+ * 7: super page
+ * 8-11: available
+ * 12-63: Host physical address
+ */
+struct dma_pte {
+ u64 val;
+};
+#define dma_clear_pte(p) do {(p).val = 0;} while (0)
+
+#define DMA_PTE_READ (1)
+#define DMA_PTE_WRITE (2)
+
+#define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while (0)
+#define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while (0)
+#define dma_set_pte_prot(p, prot) \
+ do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0)
+#define dma_pte_addr(p) ((p).val & PAGE_MASK_4K)
+#define dma_set_pte_addr(p, addr) do {\
+ (p).val |= ((addr) & PAGE_MASK_4K); } while (0)
+#define dma_pte_present(p) (((p).val & 3) != 0)
+
+struct intel_iommu;
+
+struct dmar_domain {
+ int id; /* domain id */
+ struct intel_iommu *iommu; /* back pointer to owning iommu */
+
+ struct list_head devices; /* all devices' list */
+ struct iova_domain iovad; /* iova's that belong to this domain */
+
+ struct dma_pte *pgd; /* virtual address */
+ spinlock_t mapping_lock; /* page table lock */
+ int gaw; /* max guest address width */
+
+ /* adjusted guest address width, 0 is level 2 30-bit */
+ int agaw;
+
+#define DOMAIN_FLAG_MULTIPLE_DEVICES 1
+ int flags;
+};
+
+/* PCI domain-device relationship */
+struct device_domain_info {
+ struct list_head link; /* link to domain siblings */
+ struct list_head global; /* link to global list */
+ u8 bus; /* PCI bus number */
+ u8 devfn; /* PCI devfn number */
+ struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
+ struct dmar_domain *domain; /* pointer to domain */
+};
+
+extern int init_dmars(void);
+
+struct intel_iommu {
+ void __iomem *reg; /* Pointer to hardware regs, virtual addr */
+ u64 cap;
+ u64 ecap;
+ unsigned long *domain_ids; /* bitmap of domains */
+ struct dmar_domain **domains; /* ptr to domains */
+ int seg;
+ u32 gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */
+ spinlock_t lock; /* protect context, domain ids */
+ spinlock_t register_lock; /* protect register handling */
+ struct root_entry *root_entry; /* virtual address */
+
+ unsigned int irq;
+ unsigned char name[7]; /* Device Name */
+ struct msi_msg saved_msg;
+ struct sys_device sysdev;
+};
+
+#ifndef CONFIG_DMAR_GFX_WA
+static inline void iommu_prepare_gfx_mapping(void)
+{
+ return;
+}
+#endif /* !CONFIG_DMAR_GFX_WA */
+
+#endif
diff --git a/drivers/pci/iova.c b/drivers/pci/iova.c
new file mode 100644
index 00000000000..a84571c2936
--- /dev/null
+++ b/drivers/pci/iova.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This file is released under the GPLv2.
+ *
+ * Copyright (C) 2006 Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ */
+
+#include "iova.h"
+
+void
+init_iova_domain(struct iova_domain *iovad)
+{
+ spin_lock_init(&iovad->iova_alloc_lock);
+ spin_lock_init(&iovad->iova_rbtree_lock);
+ iovad->rbroot = RB_ROOT;
+ iovad->cached32_node = NULL;
+}
+
+static struct rb_node *
+__get_cached_rbnode(struct iova_domain *iovad, unsigned long *limit_pfn)
+{
+ if ((*limit_pfn != DMA_32BIT_PFN) ||
+ (iovad->cached32_node == NULL))
+ return rb_last(&iovad->rbroot);
+ else {
+ struct rb_node *prev_node = rb_prev(iovad->cached32_node);
+ struct iova *curr_iova =
+ container_of(iovad->cached32_node, struct iova, node);
+ *limit_pfn = curr_iova->pfn_lo - 1;
+ return prev_node;
+ }
+}
+
+static void
+__cached_rbnode_insert_update(struct iova_domain *iovad,
+ unsigned long limit_pfn, struct iova *new)
+{
+ if (limit_pfn != DMA_32BIT_PFN)
+ return;
+ iovad->cached32_node = &new->node;
+}
+
+static void
+__cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
+{
+ struct iova *cached_iova;
+ struct rb_node *curr;
+
+ if (!iovad->cached32_node)
+ return;
+ curr = iovad->cached32_node;
+ cached_iova = container_of(curr, struct iova, node);
+
+ if (free->pfn_lo >= cached_iova->pfn_lo)
+ iovad->cached32_node = rb_next(&free->node);
+}
+
+/* Computes the padding size required to make the
+ * start address naturally aligned on its size
+ */
+static int
+iova_get_pad_size(int size, unsigned int limit_pfn)
+{
+ unsigned int pad_size = 0;
+ unsigned int order = ilog2(size);
+
+ if (order)
+ pad_size = (limit_pfn + 1) % (1 << order);
+
+ return pad_size;
+}
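
With a power-of-two size, this pad makes the top-down allocation in __alloc_iova_range() land on a naturally aligned pfn_lo. A standalone worked example, not part of the patch:

#include <assert.h>

static unsigned int pad_size(unsigned int size, unsigned int limit_pfn)
{
	unsigned int order = 0, pad = 0, s = size;

	while (s >>= 1)
		order++;	/* ilog2(size) */
	if (order)
		pad = (limit_pfn + 1) % (1 << order);
	return pad;
}

int main(void)
{
	unsigned int limit_pfn = 0x105, size = 8;	/* order 3 */
	unsigned int pad = pad_size(size, limit_pfn);
	unsigned int pfn_lo = limit_pfn - (size + pad) + 1;

	assert(pad == 6);
	assert(pfn_lo == 0xf8 && pfn_lo % 8 == 0);	/* size-aligned */
	return 0;
}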
+
+static int __alloc_iova_range(struct iova_domain *iovad, unsigned long size,
+ unsigned long limit_pfn, struct iova *new, bool size_aligned)
+{
+ struct rb_node *curr = NULL;
+ unsigned long flags;
+ unsigned long saved_pfn;
+ unsigned int pad_size = 0;
+
+ /* Walk the tree backwards */
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+ saved_pfn = limit_pfn;
+ curr = __get_cached_rbnode(iovad, &limit_pfn);
+ while (curr) {
+ struct iova *curr_iova = container_of(curr, struct iova, node);
+ if (limit_pfn < curr_iova->pfn_lo)
+ goto move_left;
+ else if (limit_pfn < curr_iova->pfn_hi)
+ goto adjust_limit_pfn;
+ else {
+ if (size_aligned)
+ pad_size = iova_get_pad_size(size, limit_pfn);
+ if ((curr_iova->pfn_hi + size + pad_size) <= limit_pfn)
+ break; /* found a free slot */
+ }
+adjust_limit_pfn:
+ limit_pfn = curr_iova->pfn_lo - 1;
+move_left:
+ curr = rb_prev(curr);
+ }
+
+ if (!curr) {
+ if (size_aligned)
+ pad_size = iova_get_pad_size(size, limit_pfn);
+ if ((IOVA_START_PFN + size + pad_size) > limit_pfn) {
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ return -ENOMEM;
+ }
+ }
+
+ /* pfn_lo will point to size aligned address if size_aligned is set */
+ new->pfn_lo = limit_pfn - (size + pad_size) + 1;
+ new->pfn_hi = new->pfn_lo + size - 1;
+
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ return 0;
+}
+
+static void
+iova_insert_rbtree(struct rb_root *root, struct iova *iova)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+ /* Figure out where to put new node */
+ while (*new) {
+ struct iova *this = container_of(*new, struct iova, node);
+ parent = *new;
+
+ if (iova->pfn_lo < this->pfn_lo)
+ new = &((*new)->rb_left);
+ else if (iova->pfn_lo > this->pfn_lo)
+ new = &((*new)->rb_right);
+ else
+ BUG(); /* this should not happen */
+ }
+ /* Add new node and rebalance tree. */
+ rb_link_node(&iova->node, parent, new);
+ rb_insert_color(&iova->node, root);
+}
+
+/**
+ * alloc_iova - allocates an iova
+ * @iovad: iova domain in question
+ * @size: size of page frames to allocate
+ * @limit_pfn: max limit address
+ * @size_aligned: set if a size-aligned address range is required
+ * This function allocates an iova in the range IOVA_START_PFN to limit_pfn,
+ * searching down from limit_pfn rather than up from IOVA_START_PFN. If the
+ * size_aligned flag is set then the allocated address iova->pfn_lo will be
+ * naturally aligned on roundup_pow_of_two(size).
+ */
+struct iova *
+alloc_iova(struct iova_domain *iovad, unsigned long size,
+ unsigned long limit_pfn,
+ bool size_aligned)
+{
+ unsigned long flags;
+ struct iova *new_iova;
+ int ret;
+
+ new_iova = alloc_iova_mem();
+ if (!new_iova)
+ return NULL;
+
+ /* If size_aligned is set then round the size up
+ * to the next power of two.
+ */
+ if (size_aligned)
+ size = __roundup_pow_of_two(size);
+
+ spin_lock_irqsave(&iovad->iova_alloc_lock, flags);
+ ret = __alloc_iova_range(iovad, size, limit_pfn, new_iova,
+ size_aligned);
+
+ if (ret) {
+ spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
+ free_iova_mem(new_iova);
+ return NULL;
+ }
+
+ /* Insert the new_iova into domain rbtree by holding writer lock */
+ spin_lock(&iovad->iova_rbtree_lock);
+ iova_insert_rbtree(&iovad->rbroot, new_iova);
+ __cached_rbnode_insert_update(iovad, limit_pfn, new_iova);
+ spin_unlock(&iovad->iova_rbtree_lock);
+
+ spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
+
+ return new_iova;
+}
+
+/**
+ * find_iova - finds an iova for a given pfn
+ * @iovad: iova domain in question.
+ * @pfn: page frame number
+ * This function finds and returns an iova belonging to the
+ * given domain which matches the given pfn.
+ */
+struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+ unsigned long flags;
+ struct rb_node *node;
+
+ /* Take the lock so that no other thread is manipulating the rbtree */
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+ node = iovad->rbroot.rb_node;
+ while (node) {
+ struct iova *iova = container_of(node, struct iova, node);
+
+ /* If pfn falls within iova's range, return iova */
+ if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ /* We are not holding the lock while this iova
+ * is referenced by the caller, as the same thread
+ * that called this function also calls __free_iova()
+ * and it is by design that only one thread can possibly
+ * reference a particular iova and hence there is no conflict.
+ */
+ return iova;
+ }
+
+ if (pfn < iova->pfn_lo)
+ node = node->rb_left;
+ else if (pfn > iova->pfn_lo)
+ node = node->rb_right;
+ }
+
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ return NULL;
+}
+
+/**
+ * __free_iova - frees the given iova
+ * @iovad: iova domain in question.
+ * @iova: iova in question.
+ * Frees the given iova belonging to the given domain
+ */
+void
+__free_iova(struct iova_domain *iovad, struct iova *iova)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+ __cached_rbnode_delete_update(iovad, iova);
+ rb_erase(&iova->node, &iovad->rbroot);
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ free_iova_mem(iova);
+}
+
+/**
+ * free_iova - finds and frees the iova for a given pfn
+ * @iovad: iova domain in question.
+ * @pfn: pfn that was allocated previously
+ * This function finds an iova for a given pfn and then
+ * frees the iova from that domain.
+ */
+void
+free_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+ struct iova *iova = find_iova(iovad, pfn);
+ if (iova)
+ __free_iova(iovad, iova);
+}
+
+/**
+ * put_iova_domain - destroys the iova domain
+ * @iovad: iova domain in question.
+ * All the iovas in that domain are destroyed.
+ */
+void put_iova_domain(struct iova_domain *iovad)
+{
+ struct rb_node *node;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+ node = rb_first(&iovad->rbroot);
+ while (node) {
+ struct iova *iova = container_of(node, struct iova, node);
+ rb_erase(node, &iovad->rbroot);
+ free_iova_mem(iova);
+ node = rb_first(&iovad->rbroot);
+ }
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+}
+
+static int
+__is_range_overlap(struct rb_node *node,
+ unsigned long pfn_lo, unsigned long pfn_hi)
+{
+ struct iova *iova = container_of(node, struct iova, node);
+
+ if ((pfn_lo <= iova->pfn_hi) && (pfn_hi >= iova->pfn_lo))
+ return 1;
+ return 0;
+}
+
+static struct iova *
+__insert_new_range(struct iova_domain *iovad,
+ unsigned long pfn_lo, unsigned long pfn_hi)
+{
+ struct iova *iova;
+
+ iova = alloc_iova_mem();
+ if (!iova)
+ return iova;
+
+ iova->pfn_hi = pfn_hi;
+ iova->pfn_lo = pfn_lo;
+ iova_insert_rbtree(&iovad->rbroot, iova);
+ return iova;
+}
+
+static void
+__adjust_overlap_range(struct iova *iova,
+ unsigned long *pfn_lo, unsigned long *pfn_hi)
+{
+ if (*pfn_lo < iova->pfn_lo)
+ iova->pfn_lo = *pfn_lo;
+ if (*pfn_hi > iova->pfn_hi)
+ *pfn_lo = iova->pfn_hi + 1;
+}
+
+/**
+ * reserve_iova - reserves an iova in the given range
+ * @iovad: iova domain pointer
+ * @pfn_lo: lower page frame address
+ * @pfn_hi: higher page frame address
+ * This function reserves the address range from pfn_lo to pfn_hi so
+ * that this range is not handed out by alloc_iova.
+ */
+struct iova *
+reserve_iova(struct iova_domain *iovad,
+ unsigned long pfn_lo, unsigned long pfn_hi)
+{
+ struct rb_node *node;
+ unsigned long flags;
+ struct iova *iova;
+ unsigned int overlap = 0;
+
+ spin_lock_irqsave(&iovad->iova_alloc_lock, flags);
+ spin_lock(&iovad->iova_rbtree_lock);
+ for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) {
+ if (__is_range_overlap(node, pfn_lo, pfn_hi)) {
+ iova = container_of(node, struct iova, node);
+ __adjust_overlap_range(iova, &pfn_lo, &pfn_hi);
+ if ((pfn_lo >= iova->pfn_lo) &&
+ (pfn_hi <= iova->pfn_hi))
+ goto finish;
+ overlap = 1;
+
+ } else if (overlap)
+ break;
+ }
+
+ /* We are here either because this is the first reserved node
+ * or because we need to insert the remaining non-overlapping
+ * address range
+ */
+ iova = __insert_new_range(iovad, pfn_lo, pfn_hi);
+finish:
+
+ spin_unlock(&iovad->iova_rbtree_lock);
+ spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags);
+ return iova;
+}
+
+/**
+ * copy_reserved_iova - copies the reserved iovas between domains
+ * @from: source domain from where to copy
+ * @to: destination domain where to copy
+ * This function copies reserved iovas from one domain to
+ * the other.
+ */
+void
+copy_reserved_iova(struct iova_domain *from, struct iova_domain *to)
+{
+ unsigned long flags;
+ struct rb_node *node;
+
+ spin_lock_irqsave(&from->iova_alloc_lock, flags);
+ spin_lock(&from->iova_rbtree_lock);
+ for (node = rb_first(&from->rbroot); node; node = rb_next(node)) {
+ struct iova *iova = container_of(node, struct iova, node);
+ struct iova *new_iova;
+ new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi);
+ if (!new_iova)
+ printk(KERN_ERR "Reserve iova range %lx-%lx failed\n",
+ iova->pfn_lo, iova->pfn_hi);
+ }
+ spin_unlock(&from->iova_rbtree_lock);
+ spin_unlock_irqrestore(&from->iova_alloc_lock, flags);
+}
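Taken together, the iova allocator above is used in alloc/map/free order. A minimal usage sketch, assuming a caller that already holds an initialized iova_domain (iovad is a hypothetical pointer) and wants a naturally aligned block of 16 pages below the 32-bit boundary; error handling is abbreviated:

    struct iova *iova;

    /* carve out 16 pages below 4GB; pfn_lo will be 16-page aligned */
    iova = alloc_iova(iovad, 16, DMA_32BIT_PFN, 1);
    if (!iova)
            return -ENOMEM;

    /* ... program iova->pfn_lo .. iova->pfn_hi into the IOMMU ... */

    /* later: look the range up by any pfn inside it and release it */
    free_iova(iovad, iova->pfn_lo);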
diff --git a/drivers/pci/iova.h b/drivers/pci/iova.h
new file mode 100644
index 00000000000..ae3028d5a94
--- /dev/null
+++ b/drivers/pci/iova.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This file is released under the GPLv2.
+ *
+ * Copyright (C) 2006 Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ *
+ */
+
+#ifndef _IOVA_H_
+#define _IOVA_H_
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/rbtree.h>
+#include <linux/dma-mapping.h>
+
+/*
+ * We need a fixed PAGE_SIZE of 4K irrespective of
+ * arch PAGE_SIZE for IOMMU page tables.
+ */
+#define PAGE_SHIFT_4K (12)
+#define PAGE_SIZE_4K (1UL << PAGE_SHIFT_4K)
+#define PAGE_MASK_4K (((u64)-1) << PAGE_SHIFT_4K)
+#define PAGE_ALIGN_4K(addr) (((addr) + PAGE_SIZE_4K - 1) & PAGE_MASK_4K)
+
+/* IO virtual address start page frame number */
+#define IOVA_START_PFN (1)
+
+#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT_4K)
+#define DMA_32BIT_PFN IOVA_PFN(DMA_32BIT_MASK)
+#define DMA_64BIT_PFN IOVA_PFN(DMA_64BIT_MASK)
+
+/* iova structure */
+struct iova {
+ struct rb_node node;
+ unsigned long pfn_hi; /* Highest allocated pfn */
+ unsigned long pfn_lo; /* Lowest allocated pfn */
+};
+
+/* holds all the iova translations for a domain */
+struct iova_domain {
+ spinlock_t iova_alloc_lock;/* Lock to protect iova allocation */
+ spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */
+ struct rb_root rbroot; /* iova domain rbtree root */
+ struct rb_node *cached32_node; /* Save last allocated node */
+};
+
+struct iova *alloc_iova_mem(void);
+void free_iova_mem(struct iova *iova);
+void free_iova(struct iova_domain *iovad, unsigned long pfn);
+void __free_iova(struct iova_domain *iovad, struct iova *iova);
+struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
+ unsigned long limit_pfn,
+ bool size_aligned);
+struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
+ unsigned long pfn_hi);
+void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
+void init_iova_domain(struct iova_domain *iovad);
+struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
+void put_iova_domain(struct iova_domain *iovad);
+
+#endif
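For reference, the fixed-4K helpers in this header are independent of the host PAGE_SIZE; a few worked values (assuming DMA_32BIT_MASK is 0xffffffff):

    PAGE_ALIGN_4K(0x1234) == 0x2000   /* rounded up to the next 4K page   */
    IOVA_PFN(0x12345678)  == 0x12345  /* byte address >> PAGE_SHIFT_4K    */
    DMA_32BIT_PFN         == 0xfffff  /* highest pfn reachable in 32 bits */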
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 6fda33de84e..fc87e14b50d 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -90,3 +90,4 @@ pci_match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
return NULL;
}
+struct pci_dev *pci_find_upstream_pcie_bridge(struct pci_dev *pdev);
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 5db6b6690b5..463a5a9d583 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -837,6 +837,19 @@ static void pci_release_dev(struct device *dev)
kfree(pci_dev);
}
+static void set_pcie_port_type(struct pci_dev *pdev)
+{
+ int pos;
+ u16 reg16;
+
+ pos = pci_find_capability(pdev, PCI_CAP_ID_EXP);
+ if (!pos)
+ return;
+ pdev->is_pcie = 1;
+ pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &reg16);
+ pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4;
+}
+
/**
* pci_cfg_space_size - get the configuration space size of the PCI device.
* @dev: PCI device
@@ -951,6 +964,7 @@ pci_scan_device(struct pci_bus *bus, int devfn)
dev->device = (l >> 16) & 0xffff;
dev->cfg_size = pci_cfg_space_size(dev);
dev->error_state = pci_channel_io_normal;
+ set_pcie_port_type(dev);
/* Assume 32-bit PCI; let 64-bit PCI cards (which are far rarer)
set this higher, assuming the system even supports it. */
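set_pcie_port_type() above is a straight field decode of the PCI Express Flags register. A worked example, assuming a root port whose flags register reads 0x0041 (capability version 1, device/port type 4):

    reg16 = 0x0041;
    pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4;  /* == 0x4, root port */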
diff --git a/drivers/pci/search.c b/drivers/pci/search.c
index c6e79d01ce3..b001b5922e3 100644
--- a/drivers/pci/search.c
+++ b/drivers/pci/search.c
@@ -14,6 +14,40 @@
#include "pci.h"
DECLARE_RWSEM(pci_bus_sem);
+/*
+ * Find the upstream PCIe-to-PCI bridge of a PCI device.
+ * If the device is PCIe, return NULL.
+ * If the device is connected only to legacy PCI bridges (that is, its parent
+ * is a legacy PCI bridge and the topmost bridge sits directly on bus 0),
+ * return that topmost bridge.
+ */
+struct pci_dev *
+pci_find_upstream_pcie_bridge(struct pci_dev *pdev)
+{
+ struct pci_dev *tmp = NULL;
+
+ if (pdev->is_pcie)
+ return NULL;
+ while (1) {
+ if (!pdev->bus->self)
+ break;
+ pdev = pdev->bus->self;
+ /* a p2p bridge */
+ if (!pdev->is_pcie) {
+ tmp = pdev;
+ continue;
+ }
+ /* a legacy PCI device should be reached through a PCIe-to-PCI bridge */
+ if (pdev->pcie_type != PCI_EXP_TYPE_PCI_BRIDGE) {
+ /* Busted hardware? */
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+ return pdev;
+ }
+
+ return tmp;
+}
static struct pci_bus *pci_do_find_bus(struct pci_bus *bus, unsigned char busnr)
{
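A minimal sketch of how a caller such as an IOMMU driver might use the helper above; pdev and dev are hypothetical locals, and the point is that DMA from a legacy PCI device must be attributed to the PCIe requester the IOMMU actually sees:

    struct pci_dev *dev, *bridge;

    bridge = pci_find_upstream_pcie_bridge(pdev);
    if (bridge)
            dev = bridge;   /* program the PCIe-to-PCI bridge's context */
    else
            dev = pdev;     /* native PCIe (or pure legacy) device */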
diff --git a/drivers/power/apm_power.c b/drivers/power/apm_power.c
index 39a90a6f0f8..bbf3ee10da0 100644
--- a/drivers/power/apm_power.c
+++ b/drivers/power/apm_power.c
@@ -26,65 +26,124 @@ static struct power_supply *main_battery;
static void find_main_battery(void)
{
struct device *dev;
- struct power_supply *bat, *batm;
+ struct power_supply *bat = NULL;
+ struct power_supply *max_charge_bat = NULL;
+ struct power_supply *max_energy_bat = NULL;
union power_supply_propval full;
int max_charge = 0;
+ int max_energy = 0;
main_battery = NULL;
- batm = NULL;
+
list_for_each_entry(dev, &power_supply_class->devices, node) {
bat = dev_get_drvdata(dev);
- /* If none of battery devices cantains 'use_for_apm' flag,
- choice one with maximum design charge */
- if (!PSY_PROP(bat, CHARGE_FULL_DESIGN, &full)) {
+
+ if (bat->use_for_apm) {
+ /* nice, we explicitly asked to report this battery. */
+ main_battery = bat;
+ return;
+ }
+
+ if (!PSY_PROP(bat, CHARGE_FULL_DESIGN, &full) ||
+ !PSY_PROP(bat, CHARGE_FULL, &full)) {
if (full.intval > max_charge) {
- batm = bat;
+ max_charge_bat = bat;
max_charge = full.intval;
}
+ } else if (!PSY_PROP(bat, ENERGY_FULL_DESIGN, &full) ||
+ !PSY_PROP(bat, ENERGY_FULL, &full)) {
+ if (full.intval > max_energy) {
+ max_energy_bat = bat;
+ max_energy = full.intval;
+ }
}
+ }
- if (bat->use_for_apm)
- main_battery = bat;
+ if ((max_energy_bat && max_charge_bat) &&
+ (max_energy_bat != max_charge_bat)) {
+ /* try to guess which battery has more capacity */
+ if (!PSY_PROP(max_charge_bat, VOLTAGE_MAX_DESIGN, &full)) {
+ if (max_energy > max_charge * full.intval)
+ main_battery = max_energy_bat;
+ else
+ main_battery = max_charge_bat;
+ } else if (!PSY_PROP(max_energy_bat, VOLTAGE_MAX_DESIGN,
+ &full)) {
+ if (max_charge > max_energy / full.intval)
+ main_battery = max_charge_bat;
+ else
+ main_battery = max_energy_bat;
+ } else {
+ /* give up, choose either */
+ main_battery = max_energy_bat;
+ }
+ } else if (max_charge_bat) {
+ main_battery = max_charge_bat;
+ } else if (max_energy_bat) {
+ main_battery = max_energy_bat;
+ } else {
+ /* give up, fall back to the last battery seen, if any */
+ main_battery = bat;
}
- if (!main_battery)
- main_battery = batm;
}
-static int calculate_time(int status)
+static int calculate_time(int status, int using_charge)
{
- union power_supply_propval charge_full, charge_empty;
- union power_supply_propval charge, I;
+ union power_supply_propval full;
+ union power_supply_propval empty;
+ union power_supply_propval cur;
+ union power_supply_propval I;
+ enum power_supply_property full_prop;
+ enum power_supply_property full_design_prop;
+ enum power_supply_property empty_prop;
+ enum power_supply_property empty_design_prop;
+ enum power_supply_property cur_avg_prop;
+ enum power_supply_property cur_now_prop;
- if (MPSY_PROP(CHARGE_FULL, &charge_full)) {
- /* if battery can't report this property, use design value */
- if (MPSY_PROP(CHARGE_FULL_DESIGN, &charge_full))
+ if (MPSY_PROP(CURRENT_AVG, &I)) {
+ /* if battery can't report average value, use momentary */
+ if (MPSY_PROP(CURRENT_NOW, &I))
return -1;
}
- if (MPSY_PROP(CHARGE_EMPTY, &charge_empty)) {
- /* if battery can't report this property, use design value */
- if (MPSY_PROP(CHARGE_EMPTY_DESIGN, &charge_empty))
- charge_empty.intval = 0;
+ if (using_charge) {
+ full_prop = POWER_SUPPLY_PROP_CHARGE_FULL;
+ full_design_prop = POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN;
+ empty_prop = POWER_SUPPLY_PROP_CHARGE_EMPTY;
+ empty_design_prop = POWER_SUPPLY_PROP_CHARGE_EMPTY_DESIGN;
+ cur_avg_prop = POWER_SUPPLY_PROP_CHARGE_AVG;
+ cur_now_prop = POWER_SUPPLY_PROP_CHARGE_NOW;
+ } else {
+ full_prop = POWER_SUPPLY_PROP_ENERGY_FULL;
+ full_design_prop = POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN;
+ empty_prop = POWER_SUPPLY_PROP_ENERGY_EMPTY;
+ empty_design_prop = POWER_SUPPLY_PROP_ENERGY_EMPTY_DESIGN;
+ cur_avg_prop = POWER_SUPPLY_PROP_ENERGY_AVG;
+ cur_now_prop = POWER_SUPPLY_PROP_ENERGY_NOW;
}
- if (MPSY_PROP(CHARGE_AVG, &charge)) {
- /* if battery can't report average value, use momentary */
- if (MPSY_PROP(CHARGE_NOW, &charge))
+ if (_MPSY_PROP(full_prop, &full)) {
+ /* if battery can't report this property, use design value */
+ if (_MPSY_PROP(full_design_prop, &full))
return -1;
}
- if (MPSY_PROP(CURRENT_AVG, &I)) {
+ if (_MPSY_PROP(empty_prop, &empty)) {
+ /* if battery can't report this property, use design value */
+ if (_MPSY_PROP(empty_design_prop, &empty))
+ empty.intval = 0;
+ }
+
+ if (_MPSY_PROP(cur_avg_prop, &cur)) {
/* if battery can't report average value, use momentary */
- if (MPSY_PROP(CURRENT_NOW, &I))
+ if (_MPSY_PROP(cur_now_prop, &cur))
return -1;
}
if (status == POWER_SUPPLY_STATUS_CHARGING)
- return ((charge.intval - charge_full.intval) * 60L) /
- I.intval;
+ return ((cur.intval - full.intval) * 60L) / I.intval;
else
- return -((charge.intval - charge_empty.intval) * 60L) /
- I.intval;
+ return -((cur.intval - empty.intval) * 60L) / I.intval;
}
static int calculate_capacity(int using_charge)
@@ -200,18 +259,22 @@ static void apm_battery_apm_get_power_status(struct apm_power_info *info)
info->units = APM_UNITS_MINS;
if (status.intval == POWER_SUPPLY_STATUS_CHARGING) {
- if (MPSY_PROP(TIME_TO_FULL_AVG, &time_to_full)) {
- if (MPSY_PROP(TIME_TO_FULL_NOW, &time_to_full))
- info->time = calculate_time(status.intval);
- else
- info->time = time_to_full.intval / 60;
+ if (!MPSY_PROP(TIME_TO_FULL_AVG, &time_to_full) ||
+ !MPSY_PROP(TIME_TO_FULL_NOW, &time_to_full)) {
+ info->time = time_to_full.intval / 60;
+ } else {
+ info->time = calculate_time(status.intval, 0);
+ if (info->time == -1)
+ info->time = calculate_time(status.intval, 1);
}
} else {
- if (MPSY_PROP(TIME_TO_EMPTY_AVG, &time_to_empty)) {
- if (MPSY_PROP(TIME_TO_EMPTY_NOW, &time_to_empty))
- info->time = calculate_time(status.intval);
- else
- info->time = time_to_empty.intval / 60;
+ if (!MPSY_PROP(TIME_TO_EMPTY_AVG, &time_to_empty) ||
+ !MPSY_PROP(TIME_TO_EMPTY_NOW, &time_to_empty)) {
+ info->time = time_to_empty.intval / 60;
+ } else {
+ info->time = calculate_time(status.intval, 0);
+ if (info->time == -1)
+ info->time = calculate_time(status.intval, 1);
}
}
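The time estimate in calculate_time() is plain unit arithmetic: the charge properties are reported in µAh and the current in µA, so (cur - empty) / I is hours and the factor of 60 converts to minutes. A worked discharging example, assuming the sign convention that current is negative while discharging:

    cur   =  2700000    /* µAh remaining  */
    empty =        0
    I     = -1800000    /* µA being drawn */
    time  = -((cur - empty) * 60L) / I  ==  90 minutes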
diff --git a/drivers/s390/char/raw3270.c b/drivers/s390/char/raw3270.c
index 2edd5fb6d3d..8d1c64a24de 100644
--- a/drivers/s390/char/raw3270.c
+++ b/drivers/s390/char/raw3270.c
@@ -48,8 +48,8 @@ struct raw3270 {
struct timer_list timer; /* Device timer. */
unsigned char *ascebc; /* ascii -> ebcdic table */
- struct class_device *clttydev; /* 3270-class tty device ptr */
- struct class_device *cltubdev; /* 3270-class tub device ptr */
+ struct device *clttydev; /* 3270-class tty device ptr */
+ struct device *cltubdev; /* 3270-class tub device ptr */
struct raw3270_request init_request;
unsigned char init_data[256];
@@ -1107,11 +1107,9 @@ raw3270_delete_device(struct raw3270 *rp)
/* Remove from device chain. */
mutex_lock(&raw3270_mutex);
if (rp->clttydev && !IS_ERR(rp->clttydev))
- class_device_destroy(class3270,
- MKDEV(IBM_TTY3270_MAJOR, rp->minor));
+ device_destroy(class3270, MKDEV(IBM_TTY3270_MAJOR, rp->minor));
if (rp->cltubdev && !IS_ERR(rp->cltubdev))
- class_device_destroy(class3270,
- MKDEV(IBM_FS3270_MAJOR, rp->minor));
+ device_destroy(class3270, MKDEV(IBM_FS3270_MAJOR, rp->minor));
list_del_init(&rp->list);
mutex_unlock(&raw3270_mutex);
@@ -1181,24 +1179,22 @@ static int raw3270_create_attributes(struct raw3270 *rp)
if (rc)
goto out;
- rp->clttydev = class_device_create(class3270, NULL,
- MKDEV(IBM_TTY3270_MAJOR, rp->minor),
- &rp->cdev->dev, "tty%s",
- rp->cdev->dev.bus_id);
+ rp->clttydev = device_create(class3270, &rp->cdev->dev,
+ MKDEV(IBM_TTY3270_MAJOR, rp->minor),
+ "tty%s", rp->cdev->dev.bus_id);
if (IS_ERR(rp->clttydev)) {
rc = PTR_ERR(rp->clttydev);
goto out_ttydev;
}
- rp->cltubdev = class_device_create(class3270, NULL,
- MKDEV(IBM_FS3270_MAJOR, rp->minor),
- &rp->cdev->dev, "tub%s",
- rp->cdev->dev.bus_id);
+ rp->cltubdev = device_create(class3270, &rp->cdev->dev,
+ MKDEV(IBM_FS3270_MAJOR, rp->minor),
+ "tub%s", rp->cdev->dev.bus_id);
if (!IS_ERR(rp->cltubdev))
goto out;
rc = PTR_ERR(rp->cltubdev);
- class_device_destroy(class3270, MKDEV(IBM_TTY3270_MAJOR, rp->minor));
+ device_destroy(class3270, MKDEV(IBM_TTY3270_MAJOR, rp->minor));
out_ttydev:
sysfs_remove_group(&rp->cdev->dev.kobj, &raw3270_attr_group);
diff --git a/drivers/s390/char/tape_class.c b/drivers/s390/char/tape_class.c
index 2e0d29730b6..aa7f166f403 100644
--- a/drivers/s390/char/tape_class.c
+++ b/drivers/s390/char/tape_class.c
@@ -69,12 +69,9 @@ struct tape_class_device *register_tape_dev(
if (rc)
goto fail_with_cdev;
- tcd->class_device = class_device_create(
- tape_class,
- NULL,
- tcd->char_device->dev,
- device,
- "%s", tcd->device_name
+ tcd->class_device = device_create(tape_class, device,
+ tcd->char_device->dev,
+ "%s", tcd->device_name
);
rc = IS_ERR(tcd->class_device) ? PTR_ERR(tcd->class_device) : 0;
if (rc)
@@ -90,7 +87,7 @@ struct tape_class_device *register_tape_dev(
return tcd;
fail_with_class_device:
- class_device_destroy(tape_class, tcd->char_device->dev);
+ device_destroy(tape_class, tcd->char_device->dev);
fail_with_cdev:
cdev_del(tcd->char_device);
@@ -105,11 +102,9 @@ EXPORT_SYMBOL(register_tape_dev);
void unregister_tape_dev(struct tape_class_device *tcd)
{
if (tcd != NULL && !IS_ERR(tcd)) {
- sysfs_remove_link(
- &tcd->class_device->dev->kobj,
- tcd->mode_name
- );
- class_device_destroy(tape_class, tcd->char_device->dev);
+ sysfs_remove_link(&tcd->class_device->kobj,
+ tcd->mode_name);
+ device_destroy(tape_class, tcd->char_device->dev);
cdev_del(tcd->char_device);
kfree(tcd);
}
diff --git a/drivers/s390/char/tape_class.h b/drivers/s390/char/tape_class.h
index a8bd9b47fad..e2b5ac918ac 100644
--- a/drivers/s390/char/tape_class.h
+++ b/drivers/s390/char/tape_class.h
@@ -24,8 +24,8 @@
#define TAPECLASS_NAME_LEN 32
struct tape_class_device {
- struct cdev * char_device;
- struct class_device * class_device;
+ struct cdev *char_device;
+ struct device *class_device;
char device_name[TAPECLASS_NAME_LEN];
char mode_name[TAPECLASS_NAME_LEN];
};
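The class_device to struct device conversions in the s390 files above and below all follow one shape; a before/after sketch, assuming a class pointer cls, a parent struct device *parent and a dev_t devt (device_create() folds the old, always-NULL class_device parent into its own parent argument):

    /* old */
    cd = class_device_create(cls, NULL, devt, parent, "name%d", 0);
    class_device_destroy(cls, devt);

    /* new */
    dev = device_create(cls, parent, devt, "name%d", 0);
    device_destroy(cls, devt);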
diff --git a/drivers/s390/char/vmlogrdr.c b/drivers/s390/char/vmlogrdr.c
index 12f7a4ce82c..e0c4c508e12 100644
--- a/drivers/s390/char/vmlogrdr.c
+++ b/drivers/s390/char/vmlogrdr.c
@@ -74,7 +74,7 @@ struct vmlogrdr_priv_t {
int dev_in_use; /* 1: already opened, 0: not opened*/
spinlock_t priv_lock;
struct device *device;
- struct class_device *class_device;
+ struct device *class_device;
int autorecording;
int autopurge;
};
@@ -762,12 +762,10 @@ static int vmlogrdr_register_device(struct vmlogrdr_priv_t *priv)
device_unregister(dev);
return ret;
}
- priv->class_device = class_device_create(
- vmlogrdr_class,
- NULL,
- MKDEV(vmlogrdr_major, priv->minor_num),
- dev,
- "%s", dev->bus_id );
+ priv->class_device = device_create(vmlogrdr_class, dev,
+ MKDEV(vmlogrdr_major,
+ priv->minor_num),
+ "%s", dev->bus_id);
if (IS_ERR(priv->class_device)) {
ret = PTR_ERR(priv->class_device);
priv->class_device=NULL;
@@ -783,8 +781,7 @@ static int vmlogrdr_register_device(struct vmlogrdr_priv_t *priv)
static int vmlogrdr_unregister_device(struct vmlogrdr_priv_t *priv)
{
- class_device_destroy(vmlogrdr_class,
- MKDEV(vmlogrdr_major, priv->minor_num));
+ device_destroy(vmlogrdr_class, MKDEV(vmlogrdr_major, priv->minor_num));
if (priv->device != NULL) {
sysfs_remove_group(&priv->device->kobj, &vmlogrdr_attr_group);
device_unregister(priv->device);
diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c
index 42c1f4659ad..297cdceb0ca 100644
--- a/drivers/s390/cio/chp.c
+++ b/drivers/s390/cio/chp.c
@@ -246,7 +246,7 @@ int chp_add_cmg_attr(struct channel_path *chp)
static ssize_t chp_status_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct channel_path *chp = container_of(dev, struct channel_path, dev);
+ struct channel_path *chp = to_channelpath(dev);
if (!chp)
return 0;
@@ -258,7 +258,7 @@ static ssize_t chp_status_write(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
- struct channel_path *cp = container_of(dev, struct channel_path, dev);
+ struct channel_path *cp = to_channelpath(dev);
char cmd[10];
int num_args;
int error;
@@ -286,7 +286,7 @@ static ssize_t chp_configure_show(struct device *dev,
struct channel_path *cp;
int status;
- cp = container_of(dev, struct channel_path, dev);
+ cp = to_channelpath(dev);
status = chp_info_get_status(cp->chpid);
if (status < 0)
return status;
@@ -308,7 +308,7 @@ static ssize_t chp_configure_write(struct device *dev,
return -EINVAL;
if (val != 0 && val != 1)
return -EINVAL;
- cp = container_of(dev, struct channel_path, dev);
+ cp = to_channelpath(dev);
chp_cfg_schedule(cp->chpid, val);
cfg_wait_idle();
@@ -320,7 +320,7 @@ static DEVICE_ATTR(configure, 0644, chp_configure_show, chp_configure_write);
static ssize_t chp_type_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
- struct channel_path *chp = container_of(dev, struct channel_path, dev);
+ struct channel_path *chp = to_channelpath(dev);
if (!chp)
return 0;
@@ -374,7 +374,7 @@ static void chp_release(struct device *dev)
{
struct channel_path *cp;
- cp = container_of(dev, struct channel_path, dev);
+ cp = to_channelpath(dev);
kfree(cp);
}
diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
index 5d83dd47146..838f7ac0dc3 100644
--- a/drivers/s390/cio/css.c
+++ b/drivers/s390/cio/css.c
@@ -182,6 +182,15 @@ static int css_register_subchannel(struct subchannel *sch)
sch->dev.bus = &css_bus_type;
sch->dev.release = &css_subchannel_release;
sch->dev.groups = subch_attr_groups;
+ /*
+ * We don't want to generate uevents for I/O subchannels that don't
+ * have a working ccw device behind them since they will be
+ * unregistered before they can be used anyway, so we delay the add
+ * uevent until after device recognition was successful.
+ */
+ if (!cio_is_console(sch->schid))
+ /* Console is special, no need to suppress. */
+ sch->dev.uevent_suppress = 1;
css_update_ssd_info(sch);
/* make it known to the system */
ret = css_sch_device_register(sch);
diff --git a/drivers/s390/scsi/zfcp_aux.c b/drivers/s390/scsi/zfcp_aux.c
index 7507067351b..fd5d0c1570d 100644
--- a/drivers/s390/scsi/zfcp_aux.c
+++ b/drivers/s390/scsi/zfcp_aux.c
@@ -559,6 +559,7 @@ zfcp_sg_list_alloc(struct zfcp_sg_list *sg_list, size_t size)
retval = -ENOMEM;
goto out;
}
+ sg_init_table(sg_list->sg, sg_list->count);
for (i = 0, sg = sg_list->sg; i < sg_list->count; i++, sg++) {
sg->length = min(size, PAGE_SIZE);
diff --git a/drivers/s390/scsi/zfcp_def.h b/drivers/s390/scsi/zfcp_def.h
index 57cac7008e0..326e7ee232c 100644
--- a/drivers/s390/scsi/zfcp_def.h
+++ b/drivers/s390/scsi/zfcp_def.h
@@ -63,7 +63,7 @@
static inline void *
zfcp_sg_to_address(struct scatterlist *list)
{
- return (void *) (page_address(list->page) + list->offset);
+ return sg_virt(list);
}
/**
@@ -74,7 +74,7 @@ zfcp_sg_to_address(struct scatterlist *list)
static inline void
zfcp_address_to_sg(void *address, struct scatterlist *list)
{
- list->page = virt_to_page(address);
+ sg_set_page(list, virt_to_page(address));
list->offset = ((unsigned long) address) & (PAGE_SIZE - 1);
}
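sg_virt(), used throughout the remainder of this series, is simply the accessor form of the open-coded expression it replaces; in the 2.6.24-era scatterlist API it is equivalent to:

    static inline void *sg_virt(struct scatterlist *sg)
    {
            return page_address(sg_page(sg)) + sg->offset;
    }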
diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c
index a6475a2bb8a..9438d0b2879 100644
--- a/drivers/s390/scsi/zfcp_erp.c
+++ b/drivers/s390/scsi/zfcp_erp.c
@@ -308,13 +308,15 @@ zfcp_erp_adisc(struct zfcp_port *port)
if (send_els == NULL)
goto nomem;
- send_els->req = kzalloc(sizeof(struct scatterlist), GFP_ATOMIC);
+ send_els->req = kmalloc(sizeof(struct scatterlist), GFP_ATOMIC);
if (send_els->req == NULL)
goto nomem;
+ sg_init_table(send_els->req, 1);
- send_els->resp = kzalloc(sizeof(struct scatterlist), GFP_ATOMIC);
+ send_els->resp = kmalloc(sizeof(struct scatterlist), GFP_ATOMIC);
if (send_els->resp == NULL)
goto nomem;
+ sg_init_table(send_els->resp, 1);
address = (void *) get_zeroed_page(GFP_ATOMIC);
if (address == NULL)
@@ -363,7 +365,7 @@ zfcp_erp_adisc(struct zfcp_port *port)
retval = -ENOMEM;
freemem:
if (address != NULL)
- __free_pages(send_els->req->page, 0);
+ __free_pages(sg_page(send_els->req), 0);
if (send_els != NULL) {
kfree(send_els->req);
kfree(send_els->resp);
@@ -437,7 +439,7 @@ zfcp_erp_adisc_handler(unsigned long data)
out:
zfcp_port_put(port);
- __free_pages(send_els->req->page, 0);
+ __free_pages(sg_page(send_els->req), 0);
kfree(send_els->req);
kfree(send_els->resp);
kfree(send_els);
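The kzalloc-to-kmalloc switch above works only because sg_init_table() now does the initialization (zeroing the entries and setting the termination marker). A minimal sketch of the pattern, assuming a single-entry list and a page already allocated by the caller:

    struct scatterlist *sg;

    sg = kmalloc(sizeof(*sg), GFP_ATOMIC);
    if (!sg)
            return -ENOMEM;
    sg_init_table(sg, 1);      /* zero the entry, mark it as the end */
    sg_set_page(sg, page);     /* 2.6.24-era two-argument form */
    sg->offset = 0;
    sg->length = PAGE_SIZE;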
diff --git a/drivers/sbus/char/vfc_dev.c b/drivers/sbus/char/vfc_dev.c
index e7a1642b2aa..d4f8fcded51 100644
--- a/drivers/sbus/char/vfc_dev.c
+++ b/drivers/sbus/char/vfc_dev.c
@@ -134,7 +134,7 @@ int init_vfc_hw(struct vfc_dev *dev)
int init_vfc_devstruct(struct vfc_dev *dev, int instance)
{
dev->instance=instance;
- init_MUTEX(&dev->device_lock_sem);
+ mutex_init(&dev->device_lock_mtx);
dev->control_reg=0;
dev->busy=0;
return 0;
diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c
index fb14014ee16..afb262b4be1 100644
--- a/drivers/scsi/3w-9xxx.c
+++ b/drivers/scsi/3w-9xxx.c
@@ -1840,7 +1840,7 @@ static int twa_scsiop_execute_scsi(TW_Device_Extension *tw_dev, int request_id,
(scsi_bufflen(srb) < TW_MIN_SGL_LENGTH)) {
if (srb->sc_data_direction == DMA_TO_DEVICE || srb->sc_data_direction == DMA_BIDIRECTIONAL) {
struct scatterlist *sg = scsi_sglist(srb);
- char *buf = kmap_atomic(sg->page, KM_IRQ0) + sg->offset;
+ char *buf = kmap_atomic(sg_page(sg), KM_IRQ0) + sg->offset;
memcpy(tw_dev->generic_buffer_virt[request_id], buf, sg->length);
kunmap_atomic(buf - sg->offset, KM_IRQ0);
}
@@ -1919,7 +1919,7 @@ static void twa_scsiop_execute_scsi_complete(TW_Device_Extension *tw_dev, int re
char *buf;
unsigned long flags = 0;
local_irq_save(flags);
- buf = kmap_atomic(sg->page, KM_IRQ0) + sg->offset;
+ buf = kmap_atomic(sg_page(sg), KM_IRQ0) + sg->offset;
memcpy(buf, tw_dev->generic_buffer_virt[request_id], sg->length);
kunmap_atomic(buf - sg->offset, KM_IRQ0);
local_irq_restore(flags);
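The kmap_atomic() conversions in the SCSI drivers here all share one idiom worth spelling out: the mapping returns the page base, the sg offset is added for the copy, and the unmap must be handed the page base again, hence buf - sg->offset. A minimal sketch, assuming a caller where the KM_IRQ0 slot is legitimate (cmd, data and flags are hypothetical locals):

    struct scatterlist *sg = scsi_sglist(cmd);
    char *buf;

    local_irq_save(flags);
    buf = kmap_atomic(sg_page(sg), KM_IRQ0) + sg->offset;
    memcpy(buf, data, sg->length);
    kunmap_atomic(buf - sg->offset, KM_IRQ0);  /* back to the page base */
    local_irq_restore(flags);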
diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c
index a64153b9603..59716ebeb10 100644
--- a/drivers/scsi/3w-xxxx.c
+++ b/drivers/scsi/3w-xxxx.c
@@ -1469,7 +1469,7 @@ static void tw_transfer_internal(TW_Device_Extension *tw_dev, int request_id,
struct scatterlist *sg = scsi_sglist(cmd);
local_irq_save(flags);
- buf = kmap_atomic(sg->page, KM_IRQ0) + sg->offset;
+ buf = kmap_atomic(sg_page(sg), KM_IRQ0) + sg->offset;
transfer_len = min(sg->length, len);
memcpy(buf, data, transfer_len);
diff --git a/drivers/scsi/NCR5380.c b/drivers/scsi/NCR5380.c
index 988f0bc5eda..2597209183d 100644
--- a/drivers/scsi/NCR5380.c
+++ b/drivers/scsi/NCR5380.c
@@ -298,8 +298,7 @@ static __inline__ void initialize_SCp(Scsi_Cmnd * cmd)
if (cmd->use_sg) {
cmd->SCp.buffer = (struct scatterlist *) cmd->request_buffer;
cmd->SCp.buffers_residual = cmd->use_sg - 1;
- cmd->SCp.ptr = page_address(cmd->SCp.buffer->page)+
- cmd->SCp.buffer->offset;
+ cmd->SCp.ptr = sg_virt(cmd->SCp.buffer);
cmd->SCp.this_residual = cmd->SCp.buffer->length;
} else {
cmd->SCp.buffer = NULL;
@@ -2143,8 +2142,7 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) {
++cmd->SCp.buffer;
--cmd->SCp.buffers_residual;
cmd->SCp.this_residual = cmd->SCp.buffer->length;
- cmd->SCp.ptr = page_address(cmd->SCp.buffer->page)+
- cmd->SCp.buffer->offset;
+ cmd->SCp.ptr = sg_virt(cmd->SCp.buffer);
dprintk(NDEBUG_INFORMATION, ("scsi%d : %d bytes and %d buffers left\n", instance->host_no, cmd->SCp.this_residual, cmd->SCp.buffers_residual));
}
/*
diff --git a/drivers/scsi/NCR53C9x.c b/drivers/scsi/NCR53C9x.c
index 96e8e29aa05..5b0efc90391 100644
--- a/drivers/scsi/NCR53C9x.c
+++ b/drivers/scsi/NCR53C9x.c
@@ -927,7 +927,7 @@ static void esp_get_dmabufs(struct NCR_ESP *esp, Scsi_Cmnd *sp)
esp->dma_mmu_get_scsi_sgl(esp, sp);
else
sp->SCp.ptr =
- (char *) virt_to_phys((page_address(sp->SCp.buffer->page) + sp->SCp.buffer->offset));
+ (char *) virt_to_phys(sg_virt(sp->SCp.buffer));
}
}
@@ -1748,7 +1748,7 @@ static inline void advance_sg(struct NCR_ESP *esp, Scsi_Cmnd *sp)
if (esp->dma_advance_sg)
esp->dma_advance_sg (sp);
else
- sp->SCp.ptr = (char *) virt_to_phys((page_address(sp->SCp.buffer->page) + sp->SCp.buffer->offset));
+ sp->SCp.ptr = (char *) virt_to_phys(sg_virt(sp->SCp.buffer));
}
diff --git a/drivers/scsi/NCR53c406a.c b/drivers/scsi/NCR53c406a.c
index 3168a179484..137d065db3d 100644
--- a/drivers/scsi/NCR53c406a.c
+++ b/drivers/scsi/NCR53c406a.c
@@ -875,8 +875,7 @@ static void NCR53c406a_intr(void *dev_id)
outb(TRANSFER_INFO | DMA_OP, CMD_REG);
#if USE_PIO
scsi_for_each_sg(current_SC, sg, scsi_sg_count(current_SC), i) {
- NCR53c406a_pio_write(page_address(sg->page) + sg->offset,
- sg->length);
+ NCR53c406a_pio_write(sg_virt(sg), sg->length);
}
REG0;
#endif /* USE_PIO */
@@ -897,8 +896,7 @@ static void NCR53c406a_intr(void *dev_id)
outb(TRANSFER_INFO | DMA_OP, CMD_REG);
#if USE_PIO
scsi_for_each_sg(current_SC, sg, scsi_sg_count(current_SC), i) {
- NCR53c406a_pio_read(page_address(sg->page) + sg->offset,
- sg->length);
+ NCR53c406a_pio_read(sg_virt(sg), sg->length);
}
REG0;
#endif /* USE_PIO */
diff --git a/drivers/scsi/aacraid/aachba.c b/drivers/scsi/aacraid/aachba.c
index 80e448d0f3d..a77ab8d693d 100644
--- a/drivers/scsi/aacraid/aachba.c
+++ b/drivers/scsi/aacraid/aachba.c
@@ -356,7 +356,7 @@ static void aac_internal_transfer(struct scsi_cmnd *scsicmd, void *data, unsigne
int transfer_len;
struct scatterlist *sg = scsi_sglist(scsicmd);
- buf = kmap_atomic(sg->page, KM_IRQ0) + sg->offset;
+ buf = kmap_atomic(sg_page(sg), KM_IRQ0) + sg->offset;
transfer_len = min(sg->length, len + offset);
transfer_len -= offset;
diff --git a/drivers/scsi/aha152x.c b/drivers/scsi/aha152x.c
index a58c265dc8a..ea8c6994764 100644
--- a/drivers/scsi/aha152x.c
+++ b/drivers/scsi/aha152x.c
@@ -613,7 +613,7 @@ struct aha152x_scdata {
#define SCNEXT(SCpnt) SCDATA(SCpnt)->next
#define SCSEM(SCpnt) SCDATA(SCpnt)->done
-#define SG_ADDRESS(buffer) ((char *) (page_address((buffer)->page)+(buffer)->offset))
+#define SG_ADDRESS(buffer) ((char *) sg_virt((buffer)))
/* state handling */
static void seldi_run(struct Scsi_Host *shpnt);
diff --git a/drivers/scsi/aha1542.c b/drivers/scsi/aha1542.c
index 961a1882cb7..bbcc2c52d79 100644
--- a/drivers/scsi/aha1542.c
+++ b/drivers/scsi/aha1542.c
@@ -49,7 +49,7 @@
#include "aha1542.h"
#define SCSI_BUF_PA(address) isa_virt_to_bus(address)
-#define SCSI_SG_PA(sgent) (isa_page_to_bus((sgent)->page) + (sgent)->offset)
+#define SCSI_SG_PA(sgent) (isa_page_to_bus(sg_page((sgent))) + (sgent)->offset)
static void BAD_DMA(void *address, unsigned int length)
{
@@ -66,8 +66,7 @@ static void BAD_SG_DMA(Scsi_Cmnd * SCpnt,
int badseg)
{
printk(KERN_CRIT "sgpnt[%d:%d] page %p/0x%llx length %u\n",
- badseg, nseg,
- page_address(sgp->page) + sgp->offset,
+ badseg, nseg, sg_virt(sgp),
(unsigned long long)SCSI_SG_PA(sgp),
sgp->length);
@@ -712,8 +711,7 @@ static int aha1542_queuecommand(Scsi_Cmnd * SCpnt, void (*done) (Scsi_Cmnd *))
printk(KERN_CRIT "Bad segment list supplied to aha1542.c (%d, %d)\n", SCpnt->use_sg, i);
scsi_for_each_sg(SCpnt, sg, SCpnt->use_sg, i) {
printk(KERN_CRIT "%d: %p %d\n", i,
- (page_address(sg->page) +
- sg->offset), sg->length);
+ sg_virt(sg), sg->length);
};
printk(KERN_CRIT "cptr %x: ", (unsigned int) cptr);
ptr = (unsigned char *) &cptr[i];
diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c
index f81777586b8..f7a252885a5 100644
--- a/drivers/scsi/arcmsr/arcmsr_hba.c
+++ b/drivers/scsi/arcmsr/arcmsr_hba.c
@@ -1343,7 +1343,7 @@ static int arcmsr_iop_message_xfer(struct AdapterControlBlock *acb, \
/* 4 bytes: Areca io control code */
sg = scsi_sglist(cmd);
- buffer = kmap_atomic(sg->page, KM_IRQ0) + sg->offset;
+ buffer = kmap_atomic(sg_page(sg), KM_IRQ0) + sg->offset;
if (scsi_sg_count(cmd) > 1) {
retvalue = ARCMSR_MESSAGE_FAIL;
goto message_out;
@@ -1593,7 +1593,7 @@ static void arcmsr_handle_virtual_command(struct AdapterControlBlock *acb,
strncpy(&inqdata[32], "R001", 4); /* Product Revision */
sg = scsi_sglist(cmd);
- buffer = kmap_atomic(sg->page, KM_IRQ0) + sg->offset;
+ buffer = kmap_atomic(sg_page(sg), KM_IRQ0) + sg->offset;
memcpy(buffer, inqdata, sizeof(inqdata));
sg = scsi_sglist(cmd);
diff --git a/drivers/scsi/atari_NCR5380.c b/drivers/scsi/atari_NCR5380.c
index 52d0b87e9aa..d1780980fb2 100644
--- a/drivers/scsi/atari_NCR5380.c
+++ b/drivers/scsi/atari_NCR5380.c
@@ -515,8 +515,7 @@ static inline void initialize_SCp(Scsi_Cmnd *cmd)
if (cmd->use_sg) {
cmd->SCp.buffer = (struct scatterlist *)cmd->request_buffer;
cmd->SCp.buffers_residual = cmd->use_sg - 1;
- cmd->SCp.ptr = (char *)page_address(cmd->SCp.buffer->page) +
- cmd->SCp.buffer->offset;
+ cmd->SCp.ptr = sg_virt(cmd->SCp.buffer);
cmd->SCp.this_residual = cmd->SCp.buffer->length;
/* ++roman: Try to merge some scatter-buffers if they are at
* contiguous physical addresses.
@@ -2054,8 +2053,7 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance)
++cmd->SCp.buffer;
--cmd->SCp.buffers_residual;
cmd->SCp.this_residual = cmd->SCp.buffer->length;
- cmd->SCp.ptr = page_address(cmd->SCp.buffer->page) +
- cmd->SCp.buffer->offset;
+ cmd->SCp.ptr = sg_virt(cmd->SCp.buffer);
/* ++roman: Try to merge some scatter-buffers if
* they are at contiguous physical addresses.
*/
diff --git a/drivers/scsi/eata_pio.c b/drivers/scsi/eata_pio.c
index 96180bb47e4..982c5092be1 100644
--- a/drivers/scsi/eata_pio.c
+++ b/drivers/scsi/eata_pio.c
@@ -172,7 +172,7 @@ static void IncStat(struct scsi_pointer *SCp, unsigned int Increment)
SCp->Status = 0;
else {
SCp->buffer++;
- SCp->ptr = page_address(SCp->buffer->page) + SCp->buffer->offset;
+ SCp->ptr = sg_virt(SCp->buffer);
SCp->this_residual = SCp->buffer->length;
}
}
@@ -410,7 +410,7 @@ static int eata_pio_queue(struct scsi_cmnd *cmd,
} else {
cmd->SCp.buffer = cmd->request_buffer;
cmd->SCp.buffers_residual = cmd->use_sg;
- cmd->SCp.ptr = page_address(cmd->SCp.buffer->page) + cmd->SCp.buffer->offset;
+ cmd->SCp.ptr = sg_virt(cmd->SCp.buffer);
cmd->SCp.this_residual = cmd->SCp.buffer->length;
}
cmd->SCp.Status = (cmd->SCp.this_residual != 0); /* TRUE as long as bytes
diff --git a/drivers/scsi/fd_mcs.c b/drivers/scsi/fd_mcs.c
index 668569e8856..8335b608e57 100644
--- a/drivers/scsi/fd_mcs.c
+++ b/drivers/scsi/fd_mcs.c
@@ -973,7 +973,7 @@ static irqreturn_t fd_mcs_intr(int irq, void *dev_id)
if (current_SC->SCp.buffers_residual) {
--current_SC->SCp.buffers_residual;
++current_SC->SCp.buffer;
- current_SC->SCp.ptr = page_address(current_SC->SCp.buffer->page) + current_SC->SCp.buffer->offset;
+ current_SC->SCp.ptr = sg_virt(current_SC->SCp.buffer);
current_SC->SCp.this_residual = current_SC->SCp.buffer->length;
} else
break;
@@ -1006,7 +1006,7 @@ static irqreturn_t fd_mcs_intr(int irq, void *dev_id)
if (!current_SC->SCp.this_residual && current_SC->SCp.buffers_residual) {
--current_SC->SCp.buffers_residual;
++current_SC->SCp.buffer;
- current_SC->SCp.ptr = page_address(current_SC->SCp.buffer->page) + current_SC->SCp.buffer->offset;
+ current_SC->SCp.ptr = sg_virt(current_SC->SCp.buffer);
current_SC->SCp.this_residual = current_SC->SCp.buffer->length;
}
}
@@ -1109,7 +1109,7 @@ static int fd_mcs_queue(Scsi_Cmnd * SCpnt, void (*done) (Scsi_Cmnd *))
if (current_SC->use_sg) {
current_SC->SCp.buffer = (struct scatterlist *) current_SC->request_buffer;
- current_SC->SCp.ptr = page_address(current_SC->SCp.buffer->page) + current_SC->SCp.buffer->offset;
+ current_SC->SCp.ptr = sg_virt(current_SC->SCp.buffer);
current_SC->SCp.this_residual = current_SC->SCp.buffer->length;
current_SC->SCp.buffers_residual = current_SC->use_sg - 1;
} else {
diff --git a/drivers/scsi/fdomain.c b/drivers/scsi/fdomain.c
index 5d282e6a6ae..2cd6b4959eb 100644
--- a/drivers/scsi/fdomain.c
+++ b/drivers/scsi/fdomain.c
@@ -1321,7 +1321,7 @@ static irqreturn_t do_fdomain_16x0_intr(int irq, void *dev_id)
if (current_SC->SCp.buffers_residual) {
--current_SC->SCp.buffers_residual;
++current_SC->SCp.buffer;
- current_SC->SCp.ptr = page_address(current_SC->SCp.buffer->page) + current_SC->SCp.buffer->offset;
+ current_SC->SCp.ptr = sg_virt(current_SC->SCp.buffer);
current_SC->SCp.this_residual = current_SC->SCp.buffer->length;
} else
break;
@@ -1354,7 +1354,7 @@ static irqreturn_t do_fdomain_16x0_intr(int irq, void *dev_id)
&& current_SC->SCp.buffers_residual) {
--current_SC->SCp.buffers_residual;
++current_SC->SCp.buffer;
- current_SC->SCp.ptr = page_address(current_SC->SCp.buffer->page) + current_SC->SCp.buffer->offset;
+ current_SC->SCp.ptr = sg_virt(current_SC->SCp.buffer);
current_SC->SCp.this_residual = current_SC->SCp.buffer->length;
}
}
@@ -1439,8 +1439,7 @@ static int fdomain_16x0_queue(struct scsi_cmnd *SCpnt,
if (scsi_sg_count(current_SC)) {
current_SC->SCp.buffer = scsi_sglist(current_SC);
- current_SC->SCp.ptr = page_address(current_SC->SCp.buffer->page)
- + current_SC->SCp.buffer->offset;
+ current_SC->SCp.ptr = sg_virt(current_SC->SCp.buffer);
current_SC->SCp.this_residual = current_SC->SCp.buffer->length;
current_SC->SCp.buffers_residual = scsi_sg_count(current_SC) - 1;
} else {
diff --git a/drivers/scsi/gdth.c b/drivers/scsi/gdth.c
index 3ac080ee6e2..5ab3ce76248 100644
--- a/drivers/scsi/gdth.c
+++ b/drivers/scsi/gdth.c
@@ -2374,18 +2374,18 @@ static void gdth_copy_internal_data(gdth_ha_str *ha, Scsi_Cmnd *scp,
if (cpsum+cpnow > cpcount)
cpnow = cpcount - cpsum;
cpsum += cpnow;
- if (!sl->page) {
+ if (!sg_page(sl)) {
printk("GDT-HA %d: invalid sc/gt element in gdth_copy_internal_data()\n",
ha->hanum);
return;
}
local_irq_save(flags);
- address = kmap_atomic(sl->page, KM_BIO_SRC_IRQ) + sl->offset;
+ address = kmap_atomic(sg_page(sl), KM_BIO_SRC_IRQ) + sl->offset;
if (to_buffer)
memcpy(buffer, address, cpnow);
else
memcpy(address, buffer, cpnow);
- flush_dcache_page(sl->page);
+ flush_dcache_page(sg_page(sl));
kunmap_atomic(address, KM_BIO_SRC_IRQ);
local_irq_restore(flags);
if (cpsum == cpcount)
diff --git a/drivers/scsi/ibmmca.c b/drivers/scsi/ibmmca.c
index 714e6273a70..db004a45073 100644
--- a/drivers/scsi/ibmmca.c
+++ b/drivers/scsi/ibmmca.c
@@ -1828,7 +1828,7 @@ static int ibmmca_queuecommand(Scsi_Cmnd * cmd, void (*done) (Scsi_Cmnd *))
BUG_ON(scsi_sg_count(cmd) > 16);
scsi_for_each_sg(cmd, sg, scsi_sg_count(cmd), i) {
- ld(shpnt)[ldn].sge[i].address = (void *) (isa_page_to_bus(sg->page) + sg->offset);
+ ld(shpnt)[ldn].sge[i].address = (void *) (isa_page_to_bus(sg_page(sg)) + sg->offset);
ld(shpnt)[ldn].sge[i].byte_length = sg->length;
}
scb->enable |= IM_POINTER_TO_LIST;
diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c
index 252d1806467..8d0244c2e7d 100644
--- a/drivers/scsi/ide-scsi.c
+++ b/drivers/scsi/ide-scsi.c
@@ -175,18 +175,18 @@ static void idescsi_input_buffers (ide_drive_t *drive, idescsi_pc_t *pc, unsigne
while (bcount) {
count = min(pc->sg->length - pc->b_count, bcount);
- if (PageHighMem(pc->sg->page)) {
+ if (PageHighMem(sg_page(pc->sg))) {
unsigned long flags;
local_irq_save(flags);
- buf = kmap_atomic(pc->sg->page, KM_IRQ0) +
+ buf = kmap_atomic(sg_page(pc->sg), KM_IRQ0) +
pc->sg->offset;
drive->hwif->atapi_input_bytes(drive,
buf + pc->b_count, count);
kunmap_atomic(buf - pc->sg->offset, KM_IRQ0);
local_irq_restore(flags);
} else {
- buf = page_address(pc->sg->page) + pc->sg->offset;
+ buf = sg_virt(pc->sg);
drive->hwif->atapi_input_bytes(drive,
buf + pc->b_count, count);
}
@@ -212,18 +212,18 @@ static void idescsi_output_buffers (ide_drive_t *drive, idescsi_pc_t *pc, unsign
while (bcount) {
count = min(pc->sg->length - pc->b_count, bcount);
- if (PageHighMem(pc->sg->page)) {
+ if (PageHighMem(sg_page(pc->sg))) {
unsigned long flags;
local_irq_save(flags);
- buf = kmap_atomic(pc->sg->page, KM_IRQ0) +
+ buf = kmap_atomic(sg_page(pc->sg), KM_IRQ0) +
pc->sg->offset;
drive->hwif->atapi_output_bytes(drive,
buf + pc->b_count, count);
kunmap_atomic(buf - pc->sg->offset, KM_IRQ0);
local_irq_restore(flags);
} else {
- buf = page_address(pc->sg->page) + pc->sg->offset;
+ buf = sg_virt(pc->sg);
drive->hwif->atapi_output_bytes(drive,
buf + pc->b_count, count);
}
diff --git a/drivers/scsi/imm.c b/drivers/scsi/imm.c
index 74cdc1f0a78..a3d0c6b1495 100644
--- a/drivers/scsi/imm.c
+++ b/drivers/scsi/imm.c
@@ -705,9 +705,7 @@ static int imm_completion(struct scsi_cmnd *cmd)
cmd->SCp.buffer++;
cmd->SCp.this_residual =
cmd->SCp.buffer->length;
- cmd->SCp.ptr =
- page_address(cmd->SCp.buffer->page) +
- cmd->SCp.buffer->offset;
+ cmd->SCp.ptr = sg_virt(cmd->SCp.buffer);
/*
* Make sure that we transfer even number of bytes
@@ -844,9 +842,7 @@ static int imm_engine(imm_struct *dev, struct scsi_cmnd *cmd)
cmd->SCp.buffer =
(struct scatterlist *) cmd->request_buffer;
cmd->SCp.this_residual = cmd->SCp.buffer->length;
- cmd->SCp.ptr =
- page_address(cmd->SCp.buffer->page) +
- cmd->SCp.buffer->offset;
+ cmd->SCp.ptr = sg_virt(cmd->SCp.buffer);
} else {
/* else fill the only available buffer */
cmd->SCp.buffer = NULL;
diff --git a/drivers/scsi/in2000.c b/drivers/scsi/in2000.c
index ab7cbf3449c..c8b452f2878 100644
--- a/drivers/scsi/in2000.c
+++ b/drivers/scsi/in2000.c
@@ -372,7 +372,7 @@ static int in2000_queuecommand(Scsi_Cmnd * cmd, void (*done) (Scsi_Cmnd *))
if (cmd->use_sg) {
cmd->SCp.buffer = (struct scatterlist *) cmd->request_buffer;
cmd->SCp.buffers_residual = cmd->use_sg - 1;
- cmd->SCp.ptr = (char *) page_address(cmd->SCp.buffer->page) + cmd->SCp.buffer->offset;
+ cmd->SCp.ptr = sg_virt(cmd->SCp.buffer);
cmd->SCp.this_residual = cmd->SCp.buffer->length;
} else {
cmd->SCp.buffer = NULL;
@@ -764,7 +764,7 @@ static void transfer_bytes(Scsi_Cmnd * cmd, int data_in_dir)
++cmd->SCp.buffer;
--cmd->SCp.buffers_residual;
cmd->SCp.this_residual = cmd->SCp.buffer->length;
- cmd->SCp.ptr = page_address(cmd->SCp.buffer->page) + cmd->SCp.buffer->offset;
+ cmd->SCp.ptr = sg_virt(cmd->SCp.buffer);
}
/* Set up hardware registers */
diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index c316a0bcae6..439b97a6a26 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -2872,6 +2872,7 @@ static struct ipr_sglist *ipr_alloc_ucode_buffer(int buf_len)
}
scatterlist = sglist->scatterlist;
+ sg_init_table(scatterlist, num_elem);
sglist->order = order;
sglist->num_sg = num_elem;
@@ -2884,12 +2885,12 @@ static struct ipr_sglist *ipr_alloc_ucode_buffer(int buf_len)
/* Free up what we already allocated */
for (j = i - 1; j >= 0; j--)
- __free_pages(scatterlist[j].page, order);
+ __free_pages(sg_page(&scatterlist[j]), order);
kfree(sglist);
return NULL;
}
- scatterlist[i].page = page;
+ sg_set_page(&scatterlist[i], page);
}
return sglist;
@@ -2910,7 +2911,7 @@ static void ipr_free_ucode_buffer(struct ipr_sglist *sglist)
int i;
for (i = 0; i < sglist->num_sg; i++)
- __free_pages(sglist->scatterlist[i].page, sglist->order);
+ __free_pages(sg_page(&sglist->scatterlist[i]), sglist->order);
kfree(sglist);
}
@@ -2940,9 +2941,11 @@ static int ipr_copy_ucode_buffer(struct ipr_sglist *sglist,
scatterlist = sglist->scatterlist;
for (i = 0; i < (len / bsize_elem); i++, buffer += bsize_elem) {
- kaddr = kmap(scatterlist[i].page);
+ struct page *page = sg_page(&scatterlist[i]);
+
+ kaddr = kmap(page);
memcpy(kaddr, buffer, bsize_elem);
- kunmap(scatterlist[i].page);
+ kunmap(page);
scatterlist[i].length = bsize_elem;
@@ -2953,9 +2956,11 @@ static int ipr_copy_ucode_buffer(struct ipr_sglist *sglist,
}
if (len % bsize_elem) {
- kaddr = kmap(scatterlist[i].page);
+ struct page *page = sg_page(&scatterlist[i]);
+
+ kaddr = kmap(page);
memcpy(kaddr, buffer, len % bsize_elem);
- kunmap(scatterlist[i].page);
+ kunmap(page);
scatterlist[i].length = len % bsize_elem;
}
diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c
index edaac2714c5..5c5a9b2628f 100644
--- a/drivers/scsi/ips.c
+++ b/drivers/scsi/ips.c
@@ -1515,7 +1515,7 @@ static int ips_is_passthru(struct scsi_cmnd *SC)
/* kmap_atomic() ensures addressability of the user buffer.*/
/* local_irq_save() protects the KM_IRQ0 address slot. */
local_irq_save(flags);
- buffer = kmap_atomic(sg->page, KM_IRQ0) + sg->offset;
+ buffer = kmap_atomic(sg_page(sg), KM_IRQ0) + sg->offset;
if (buffer && buffer[0] == 'C' && buffer[1] == 'O' &&
buffer[2] == 'P' && buffer[3] == 'P') {
kunmap_atomic(buffer - sg->offset, KM_IRQ0);
@@ -3523,7 +3523,7 @@ ips_scmd_buf_write(struct scsi_cmnd *scmd, void *data, unsigned int count)
/* kmap_atomic() ensures addressability of the data buffer.*/
/* local_irq_save() protects the KM_IRQ0 address slot. */
local_irq_save(flags);
- buffer = kmap_atomic(sg[i].page, KM_IRQ0) + sg[i].offset;
+ buffer = kmap_atomic(sg_page(&sg[i]), KM_IRQ0) + sg[i].offset;
memcpy(buffer, &cdata[xfer_cnt], min_cnt);
kunmap_atomic(buffer - sg[i].offset, KM_IRQ0);
local_irq_restore(flags);
@@ -3556,7 +3556,7 @@ ips_scmd_buf_read(struct scsi_cmnd *scmd, void *data, unsigned int count)
/* kmap_atomic() ensures addressability of the data buffer.*/
/* local_irq_save() protects the KM_IRQ0 address slot. */
local_irq_save(flags);
- buffer = kmap_atomic(sg[i].page, KM_IRQ0) + sg[i].offset;
+ buffer = kmap_atomic(sg_page(&sg[i]), KM_IRQ0) + sg[i].offset;
memcpy(&cdata[xfer_cnt], buffer, min_cnt);
kunmap_atomic(buffer - sg[i].offset, KM_IRQ0);
local_irq_restore(flags);
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index a21455d0274..6ce4109efdf 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -70,9 +70,7 @@ module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO);
static inline void
iscsi_buf_init_iov(struct iscsi_buf *ibuf, char *vbuf, int size)
{
- ibuf->sg.page = virt_to_page(vbuf);
- ibuf->sg.offset = offset_in_page(vbuf);
- ibuf->sg.length = size;
+ sg_init_one(&ibuf->sg, vbuf, size);
ibuf->sent = 0;
ibuf->use_sendmsg = 1;
}
@@ -80,13 +78,14 @@ iscsi_buf_init_iov(struct iscsi_buf *ibuf, char *vbuf, int size)
static inline void
iscsi_buf_init_sg(struct iscsi_buf *ibuf, struct scatterlist *sg)
{
- ibuf->sg.page = sg->page;
+ sg_init_table(&ibuf->sg, 1);
+ sg_set_page(&ibuf->sg, sg_page(sg));
ibuf->sg.offset = sg->offset;
ibuf->sg.length = sg->length;
/*
* Fastpath: sg element fits into single page
*/
- if (sg->length + sg->offset <= PAGE_SIZE && !PageSlab(sg->page))
+ if (sg->length + sg->offset <= PAGE_SIZE && !PageSlab(sg_page(sg)))
ibuf->use_sendmsg = 0;
else
ibuf->use_sendmsg = 1;
@@ -716,7 +715,7 @@ static int iscsi_scsi_data_in(struct iscsi_conn *conn)
for (i = tcp_ctask->sg_count; i < scsi_sg_count(sc); i++) {
char *dest;
- dest = kmap_atomic(sg[i].page, KM_SOFTIRQ0);
+ dest = kmap_atomic(sg_page(&sg[i]), KM_SOFTIRQ0);
rc = iscsi_ctask_copy(tcp_conn, ctask, dest + sg[i].offset,
sg[i].length, offset);
kunmap_atomic(dest, KM_SOFTIRQ0);
@@ -1103,9 +1102,9 @@ iscsi_send(struct iscsi_conn *conn, struct iscsi_buf *buf, int size, int flags)
* slab case.
*/
if (buf->use_sendmsg)
- res = sock_no_sendpage(sk, buf->sg.page, offset, size, flags);
+ res = sock_no_sendpage(sk, sg_page(&buf->sg), offset, size, flags);
else
- res = tcp_conn->sendpage(sk, buf->sg.page, offset, size, flags);
+ res = tcp_conn->sendpage(sk, sg_page(&buf->sg), offset, size, flags);
if (res >= 0) {
conn->txdata_octets += res;
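sg_init_one(), used in iscsi_buf_init_iov() above, bundles the open-coded assignments it replaces; conceptually it amounts to:

    sg_init_table(sg, 1);
    sg_set_page(sg, virt_to_page(vbuf));
    sg->offset = offset_in_page(vbuf);
    sg->length = size;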
diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c
index 10d1aff9938..66c65203573 100644
--- a/drivers/scsi/megaraid.c
+++ b/drivers/scsi/megaraid.c
@@ -658,7 +658,7 @@ mega_build_cmd(adapter_t *adapter, Scsi_Cmnd *cmd, int *busy)
struct scatterlist *sg;
sg = scsi_sglist(cmd);
- buf = kmap_atomic(sg->page, KM_IRQ0) + sg->offset;
+ buf = kmap_atomic(sg_page(sg), KM_IRQ0) + sg->offset;
memset(buf, 0, cmd->cmnd[4]);
kunmap_atomic(buf - sg->offset, KM_IRQ0);
@@ -1542,10 +1542,8 @@ mega_cmd_done(adapter_t *adapter, u8 completed[], int nstatus, int status)
if( cmd->cmnd[0] == INQUIRY && !islogical ) {
sgl = scsi_sglist(cmd);
- if( sgl->page ) {
- c = *(unsigned char *)
- page_address((&sgl[0])->page) +
- (&sgl[0])->offset;
+ if( sg_page(sgl) ) {
+ c = *(unsigned char *) sg_virt(&sgl[0]);
} else {
printk(KERN_WARNING
"megaraid: invalid sg.\n");
diff --git a/drivers/scsi/megaraid/megaraid_mbox.c b/drivers/scsi/megaraid/megaraid_mbox.c
index 78779209ac8..c8923108183 100644
--- a/drivers/scsi/megaraid/megaraid_mbox.c
+++ b/drivers/scsi/megaraid/megaraid_mbox.c
@@ -1584,10 +1584,8 @@ megaraid_mbox_build_cmd(adapter_t *adapter, struct scsi_cmnd *scp, int *busy)
caddr_t vaddr;
sgl = scsi_sglist(scp);
- if (sgl->page) {
- vaddr = (caddr_t)
- (page_address((&sgl[0])->page)
- + (&sgl[0])->offset);
+ if (sg_page(sgl)) {
+ vaddr = (caddr_t) sg_virt(&sgl[0]);
memset(vaddr, 0, scp->cmnd[4]);
}
@@ -2328,10 +2326,8 @@ megaraid_mbox_dpc(unsigned long devp)
&& IS_RAID_CH(raid_dev, scb->dev_channel)) {
sgl = scsi_sglist(scp);
- if (sgl->page) {
- c = *(unsigned char *)
- (page_address((&sgl[0])->page) +
- (&sgl[0])->offset);
+ if (sg_page(sgl)) {
+ c = *(unsigned char *) sg_virt(&sgl[0]);
} else {
con_log(CL_ANN, (KERN_WARNING
"megaraid mailbox: invalid sg:%d\n",
diff --git a/drivers/scsi/oktagon_esp.c b/drivers/scsi/oktagon_esp.c
index 26a6d55faf3..8e5eadbd5c5 100644
--- a/drivers/scsi/oktagon_esp.c
+++ b/drivers/scsi/oktagon_esp.c
@@ -550,8 +550,7 @@ void dma_mmu_get_scsi_one(struct NCR_ESP *esp, Scsi_Cmnd *sp)
void dma_mmu_get_scsi_sgl(struct NCR_ESP *esp, Scsi_Cmnd *sp)
{
- sp->SCp.ptr = page_address(sp->SCp.buffer->page)+
- sp->SCp.buffer->offset;
+ sp->SCp.ptr = sg_virt(sp->SCp.buffer);
}
void dma_mmu_release_scsi_one(struct NCR_ESP *esp, Scsi_Cmnd *sp)
@@ -564,8 +563,7 @@ void dma_mmu_release_scsi_sgl(struct NCR_ESP *esp, Scsi_Cmnd *sp)
void dma_advance_sg(Scsi_Cmnd *sp)
{
- sp->SCp.ptr = page_address(sp->SCp.buffer->page)+
- sp->SCp.buffer->offset;
+ sp->SCp.ptr = sg_virt(sp->SCp.buffer);
}
diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c
index 331b789937c..1c5c4b68f20 100644
--- a/drivers/scsi/osst.c
+++ b/drivers/scsi/osst.c
@@ -542,7 +542,7 @@ static int osst_verify_frame(struct osst_tape * STp, int frame_seq_number, int q
if (STp->raw) {
if (STp->buffer->syscall_result) {
for (i=0; i < STp->buffer->sg_segs; i++)
- memset(page_address(STp->buffer->sg[i].page),
+ memset(page_address(sg_page(&STp->buffer->sg[i])),
0, STp->buffer->sg[i].length);
strcpy(STp->buffer->b_data, "READ ERROR ON FRAME");
} else
@@ -4437,7 +4437,7 @@ static int os_scsi_tape_open(struct inode * inode, struct file * filp)
for (i = 0, b_size = 0;
(i < STp->buffer->sg_segs) && ((b_size + STp->buffer->sg[i].length) <= OS_DATA_SIZE);
b_size += STp->buffer->sg[i++].length);
- STp->buffer->aux = (os_aux_t *) (page_address(STp->buffer->sg[i].page) + OS_DATA_SIZE - b_size);
+ STp->buffer->aux = (os_aux_t *) (page_address(sg_page(&STp->buffer->sg[i])) + OS_DATA_SIZE - b_size);
#if DEBUG
printk(OSST_DEB_MSG "%s:D: b_data points to %p in segment 0 at %p\n", name,
STp->buffer->b_data, page_address(STp->buffer->sg[0].page));
@@ -5252,25 +5252,26 @@ static int enlarge_buffer(struct osst_buffer *STbuffer, int need_dma)
/* Try to allocate the first segment up to OS_DATA_SIZE and the others
big enough to reach the goal (code assumes no segments in place) */
for (b_size = OS_DATA_SIZE, order = OSST_FIRST_ORDER; b_size >= PAGE_SIZE; order--, b_size /= 2) {
- STbuffer->sg[0].page = alloc_pages(priority, order);
+ struct page *page = alloc_pages(priority, order);
+
STbuffer->sg[0].offset = 0;
- if (STbuffer->sg[0].page != NULL) {
+ if (page != NULL) {
+ sg_set_page(&STbuffer->sg[0], page);
STbuffer->sg[0].length = b_size;
- STbuffer->b_data = page_address(STbuffer->sg[0].page);
+ STbuffer->b_data = page_address(page);
break;
}
}
- if (STbuffer->sg[0].page == NULL) {
+ if (sg_page(&STbuffer->sg[0]) == NULL) {
printk(KERN_NOTICE "osst :I: Can't allocate tape buffer main segment.\n");
return 0;
}
/* Got initial segment of 'bsize,order', continue with same size if possible, except for AUX */
for (segs=STbuffer->sg_segs=1, got=b_size;
segs < max_segs && got < OS_FRAME_SIZE; ) {
- STbuffer->sg[segs].page =
- alloc_pages(priority, (OS_FRAME_SIZE - got <= PAGE_SIZE) ? 0 : order);
+ struct page *page = alloc_pages(priority, (OS_FRAME_SIZE - got <= PAGE_SIZE) ? 0 : order);
STbuffer->sg[segs].offset = 0;
- if (STbuffer->sg[segs].page == NULL) {
+ if (page == NULL) {
if (OS_FRAME_SIZE - got <= (max_segs - segs) * b_size / 2 && order) {
b_size /= 2; /* Large enough for the rest of the buffers */
order--;
@@ -5284,6 +5285,7 @@ static int enlarge_buffer(struct osst_buffer *STbuffer, int need_dma)
normalize_buffer(STbuffer);
return 0;
}
+ sg_set_page(&STbuffer->sg[segs], page);
STbuffer->sg[segs].length = (OS_FRAME_SIZE - got <= PAGE_SIZE / 2) ? (OS_FRAME_SIZE - got) : b_size;
got += STbuffer->sg[segs].length;
STbuffer->buffer_size = got;
@@ -5316,7 +5318,7 @@ static void normalize_buffer(struct osst_buffer *STbuffer)
b_size < STbuffer->sg[i].length;
b_size *= 2, order++);
- __free_pages(STbuffer->sg[i].page, order);
+ __free_pages(sg_page(&STbuffer->sg[i]), order);
STbuffer->buffer_size -= STbuffer->sg[i].length;
}
#if DEBUG
@@ -5344,7 +5346,7 @@ static int append_to_buffer(const char __user *ubp, struct osst_buffer *st_bp, i
for ( ; i < st_bp->sg_segs && do_count > 0; i++) {
cnt = st_bp->sg[i].length - offset < do_count ?
st_bp->sg[i].length - offset : do_count;
- res = copy_from_user(page_address(st_bp->sg[i].page) + offset, ubp, cnt);
+ res = copy_from_user(page_address(sg_page(&st_bp->sg[i])) + offset, ubp, cnt);
if (res)
return (-EFAULT);
do_count -= cnt;
@@ -5377,7 +5379,7 @@ static int from_buffer(struct osst_buffer *st_bp, char __user *ubp, int do_count
for ( ; i < st_bp->sg_segs && do_count > 0; i++) {
cnt = st_bp->sg[i].length - offset < do_count ?
st_bp->sg[i].length - offset : do_count;
- res = copy_to_user(ubp, page_address(st_bp->sg[i].page) + offset, cnt);
+ res = copy_to_user(ubp, page_address(sg_page(&st_bp->sg[i])) + offset, cnt);
if (res)
return (-EFAULT);
do_count -= cnt;
@@ -5410,7 +5412,7 @@ static int osst_zero_buffer_tail(struct osst_buffer *st_bp)
i < st_bp->sg_segs && do_count > 0; i++) {
cnt = st_bp->sg[i].length - offset < do_count ?
st_bp->sg[i].length - offset : do_count ;
- memset(page_address(st_bp->sg[i].page) + offset, 0, cnt);
+ memset(page_address(sg_page(&st_bp->sg[i])) + offset, 0, cnt);
do_count -= cnt;
offset = 0;
}
@@ -5430,7 +5432,7 @@ static int osst_copy_to_buffer(struct osst_buffer *st_bp, unsigned char *ptr)
for (i = 0; i < st_bp->sg_segs && do_count > 0; i++) {
cnt = st_bp->sg[i].length < do_count ?
st_bp->sg[i].length : do_count ;
- memcpy(page_address(st_bp->sg[i].page), ptr, cnt);
+ memcpy(page_address(sg_page(&st_bp->sg[i])), ptr, cnt);
do_count -= cnt;
ptr += cnt;
}
@@ -5451,7 +5453,7 @@ static int osst_copy_from_buffer(struct osst_buffer *st_bp, unsigned char *ptr)
for (i = 0; i < st_bp->sg_segs && do_count > 0; i++) {
cnt = st_bp->sg[i].length < do_count ?
st_bp->sg[i].length : do_count ;
- memcpy(ptr, page_address(st_bp->sg[i].page), cnt);
+ memcpy(ptr, page_address(sg_page(&st_bp->sg[i])), cnt);
do_count -= cnt;
ptr += cnt;
}
diff --git a/drivers/scsi/pcmcia/nsp_cs.h b/drivers/scsi/pcmcia/nsp_cs.h
index 98397559c53..7db28cd4944 100644
--- a/drivers/scsi/pcmcia/nsp_cs.h
+++ b/drivers/scsi/pcmcia/nsp_cs.h
@@ -393,7 +393,7 @@ enum _burst_mode {
#define MSG_EXT_SDTR 0x01
/* scatter-gather table */
-# define BUFFER_ADDR ((char *)((unsigned int)(SCpnt->SCp.buffer->page) + SCpnt->SCp.buffer->offset))
+# define BUFFER_ADDR ((char *)((sg_virt(SCpnt->SCp.buffer))))
#endif /*__nsp_cs__*/
/* end */
diff --git a/drivers/scsi/pcmcia/sym53c500_cs.c b/drivers/scsi/pcmcia/sym53c500_cs.c
index 190e2a7d706..969b9387a0c 100644
--- a/drivers/scsi/pcmcia/sym53c500_cs.c
+++ b/drivers/scsi/pcmcia/sym53c500_cs.c
@@ -443,8 +443,7 @@ SYM53C500_intr(int irq, void *dev_id)
scsi_for_each_sg(curSC, sg, scsi_sg_count(curSC), i) {
SYM53C500_pio_write(fast_pio, port_base,
- page_address(sg->page) + sg->offset,
- sg->length);
+ sg_virt(sg), sg->length);
}
REG0(port_base);
}
@@ -463,8 +462,7 @@ SYM53C500_intr(int irq, void *dev_id)
scsi_for_each_sg(curSC, sg, scsi_sg_count(curSC), i) {
SYM53C500_pio_read(fast_pio, port_base,
- page_address(sg->page) + sg->offset,
- sg->length);
+ sg_virt(sg), sg->length);
}
REG0(port_base);
}
diff --git a/drivers/scsi/ppa.c b/drivers/scsi/ppa.c
index 67b6d76a6c8..67ee51a3d7e 100644
--- a/drivers/scsi/ppa.c
+++ b/drivers/scsi/ppa.c
@@ -608,9 +608,7 @@ static int ppa_completion(struct scsi_cmnd *cmd)
cmd->SCp.buffer++;
cmd->SCp.this_residual =
cmd->SCp.buffer->length;
- cmd->SCp.ptr =
- page_address(cmd->SCp.buffer->page) +
- cmd->SCp.buffer->offset;
+ cmd->SCp.ptr = sg_virt(cmd->SCp.buffer);
}
}
/* Now check to see if the drive is ready to comunicate */
@@ -756,8 +754,7 @@ static int ppa_engine(ppa_struct *dev, struct scsi_cmnd *cmd)
/* if many buffers are available, start filling the first */
cmd->SCp.buffer = (struct scatterlist *) cmd->request_buffer;
cmd->SCp.this_residual = cmd->SCp.buffer->length;
- cmd->SCp.ptr = page_address(cmd->SCp.buffer->page) +
- cmd->SCp.buffer->offset;
+ cmd->SCp.ptr = sg_virt(cmd->SCp.buffer);
} else {
/* else fill the only available buffer */
cmd->SCp.buffer = NULL;
diff --git a/drivers/scsi/ps3rom.c b/drivers/scsi/ps3rom.c
index 0f43d1d046d..17b4a7c4618 100644
--- a/drivers/scsi/ps3rom.c
+++ b/drivers/scsi/ps3rom.c
@@ -111,14 +111,14 @@ static int fill_from_dev_buffer(struct scsi_cmnd *cmd, const void *buf)
req_len = act_len = 0;
scsi_for_each_sg(cmd, sgpnt, scsi_sg_count(cmd), k) {
if (active) {
- kaddr = kmap_atomic(sgpnt->page, KM_IRQ0);
+ kaddr = kmap_atomic(sg_page(sgpnt), KM_IRQ0);
len = sgpnt->length;
if ((req_len + len) > buflen) {
active = 0;
len = buflen - req_len;
}
memcpy(kaddr + sgpnt->offset, buf + req_len, len);
- flush_kernel_dcache_page(sgpnt->page);
+ flush_kernel_dcache_page(sg_page(sgpnt));
kunmap_atomic(kaddr, KM_IRQ0);
act_len += len;
}
@@ -147,7 +147,7 @@ static int fetch_to_dev_buffer(struct scsi_cmnd *cmd, void *buf)
req_len = fin = 0;
scsi_for_each_sg(cmd, sgpnt, scsi_sg_count(cmd), k) {
- kaddr = kmap_atomic(sgpnt->page, KM_IRQ0);
+ kaddr = kmap_atomic(sg_page(sgpnt), KM_IRQ0);
len = sgpnt->length;
if ((req_len + len) > buflen) {
len = buflen - req_len;
diff --git a/drivers/scsi/qlogicfas408.c b/drivers/scsi/qlogicfas408.c
index 2bfbf26c00e..de7b3bc2cbc 100644
--- a/drivers/scsi/qlogicfas408.c
+++ b/drivers/scsi/qlogicfas408.c
@@ -317,7 +317,7 @@ static unsigned int ql_pcmd(struct scsi_cmnd *cmd)
return ((priv->qabort == 1 ?
DID_ABORT : DID_RESET) << 16);
}
- buf = page_address(sg->page) + sg->offset;
+ buf = sg_virt(sg);
if (ql_pdma(priv, phase, buf, sg->length))
break;
}
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 72ee4c9cfb1..46cae5a212d 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -625,7 +625,7 @@ static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
scsi_for_each_sg(scp, sg, scp->use_sg, k) {
if (active) {
kaddr = (unsigned char *)
- kmap_atomic(sg->page, KM_USER0);
+ kmap_atomic(sg_page(sg), KM_USER0);
if (NULL == kaddr)
return (DID_ERROR << 16);
kaddr_off = (unsigned char *)kaddr + sg->offset;
@@ -672,7 +672,7 @@ static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
sg = scsi_sglist(scp);
req_len = fin = 0;
for (k = 0; k < scp->use_sg; ++k, sg = sg_next(sg)) {
- kaddr = (unsigned char *)kmap_atomic(sg->page, KM_USER0);
+ kaddr = (unsigned char *)kmap_atomic(sg_page(sg), KM_USER0);
if (NULL == kaddr)
return -1;
kaddr_off = (unsigned char *)kaddr + sg->offset;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index aac8a02cbe8..61fdaf02f25 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -295,7 +295,7 @@ static int scsi_req_map_sg(struct request *rq, struct scatterlist *sgl,
int i, err, nr_vecs = 0;
for_each_sg(sgl, sg, nsegs, i) {
- page = sg->page;
+ page = sg_page(sg);
off = sg->offset;
len = sg->length;
data_len += len;
@@ -764,7 +764,7 @@ struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask)
if (unlikely(!sgl))
goto enomem;
- memset(sgl, 0, sizeof(*sgl) * sgp->size);
+ sg_init_table(sgl, sgp->size);
/*
* first loop through, set initial index and return value
@@ -781,6 +781,13 @@ struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask)
sg_chain(prev, SCSI_MAX_SG_SEGMENTS, sgl);
/*
+ * if we have nothing left, mark the last segment as
+ * end-of-list
+ */
+ if (!left)
+ sg_mark_end(sgl, this);
+
+ /*
* don't allow subsequent mempool allocs to sleep, it would
* violate the mempool principle.
*/
@@ -2353,7 +2360,7 @@ void *scsi_kmap_atomic_sg(struct scatterlist *sgl, int sg_count,
*offset = *offset - len_complete + sg->offset;
/* Assumption: contiguous pages can be accessed as "page + i" */
- page = nth_page(sg->page, (*offset >> PAGE_SHIFT));
+ page = nth_page(sg_page(sg), (*offset >> PAGE_SHIFT));
*offset &= ~PAGE_MASK;
/* Bytes in this sg-entry from *offset to the end of the page */
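The sg_init_table()/sg_mark_end() pair added to scsi_alloc_sgtable() is what keeps a chained table walkable: sg_init_table() clears the entries (and, when scatterlist debugging is enabled, stamps them with a magic value), sg_chain() links one allocation to the next, and sg_mark_end() flags the final entry so iterators stop there instead of running off into a chain pointer. A hedged sketch of filling a small table with the same two-argument helpers this patch uses (page0/page1 stand in for the caller's pages):

	struct scatterlist sgl[2];

	sg_init_table(sgl, 2);		/* zero entries, set debug magic */
	sg_set_page(&sgl[0], page0);
	sgl[0].offset = 0;
	sgl[0].length = PAGE_SIZE;
	sg_set_page(&sgl[1], page1);
	sgl[1].offset = 0;
	sgl[1].length = PAGE_SIZE;
	sg_mark_end(sgl, 2);		/* entry 2 is the last one */
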
diff --git a/drivers/scsi/seagate.c b/drivers/scsi/seagate.c
index ce80fa9ad81..b11324479b5 100644
--- a/drivers/scsi/seagate.c
+++ b/drivers/scsi/seagate.c
@@ -999,14 +999,14 @@ connect_loop:
for (i = 0; i < nobuffs; ++i)
printk("scsi%d : buffer %d address = %p length = %d\n",
hostno, i,
- page_address(buffer[i].page) + buffer[i].offset,
+ sg_virt(&buffer[i]),
buffer[i].length);
}
#endif
buffer = (struct scatterlist *) SCint->request_buffer;
len = buffer->length;
- data = page_address(buffer->page) + buffer->offset;
+ data = sg_virt(buffer);
} else {
DPRINTK (DEBUG_SG, "scsi%d : scatter gather not requested.\n", hostno);
buffer = NULL;
@@ -1239,7 +1239,7 @@ connect_loop:
--nobuffs;
++buffer;
len = buffer->length;
- data = page_address(buffer->page) + buffer->offset;
+ data = sg_virt(buffer);
DPRINTK (DEBUG_SG,
"scsi%d : next scatter-gather buffer len = %d address = %08x\n",
hostno, len, data);
@@ -1396,7 +1396,7 @@ connect_loop:
--nobuffs;
++buffer;
len = buffer->length;
- data = page_address(buffer->page) + buffer->offset;
+ data = sg_virt(buffer);
DPRINTK (DEBUG_SG, "scsi%d : next scatter-gather buffer len = %d address = %08x\n", hostno, len, data);
}
break;
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 7238b2dfc49..cc197100284 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1169,7 +1169,7 @@ sg_vma_nopage(struct vm_area_struct *vma, unsigned long addr, int *type)
len = vma->vm_end - sa;
len = (len < sg->length) ? len : sg->length;
if (offset < len) {
- page = virt_to_page(page_address(sg->page) + offset);
+ page = virt_to_page(page_address(sg_page(sg)) + offset);
get_page(page); /* increment page count */
break;
}
@@ -1717,13 +1717,13 @@ st_map_user_pages(struct scatterlist *sgl, const unsigned int max_pages,
goto out_unlock; */
}
- sgl[0].page = pages[0];
+ sg_set_page(sgl, pages[0]);
sgl[0].offset = uaddr & ~PAGE_MASK;
if (nr_pages > 1) {
sgl[0].length = PAGE_SIZE - sgl[0].offset;
count -= sgl[0].length;
for (i=1; i < nr_pages ; i++) {
- sgl[i].page = pages[i];
+ sg_set_page(&sgl[i], pages[i]);
sgl[i].length = count < PAGE_SIZE ? count : PAGE_SIZE;
count -= PAGE_SIZE;
}
@@ -1754,7 +1754,7 @@ st_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_pages,
int i;
for (i=0; i < nr_pages; i++) {
- struct page *page = sgl[i].page;
+ struct page *page = sg_page(&sgl[i]);
if (dirtied)
SetPageDirty(page);
@@ -1854,7 +1854,7 @@ sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size)
scatter_elem_sz_prev = ret_sz;
}
}
- sg->page = p;
+ sg_set_page(sg, p);
sg->length = (ret_sz > num) ? num : ret_sz;
SCSI_LOG_TIMEOUT(5, printk("sg_build_indirect: k=%d, num=%d, "
@@ -1907,14 +1907,14 @@ sg_write_xfer(Sg_request * srp)
onum = 1;
ksglen = sg->length;
- p = page_address(sg->page);
+ p = page_address(sg_page(sg));
for (j = 0, k = 0; j < onum; ++j) {
res = sg_u_iovec(hp, iovec_count, j, 1, &usglen, &up);
if (res)
return res;
for (; p; sg = sg_next(sg), ksglen = sg->length,
- p = page_address(sg->page)) {
+ p = page_address(sg_page(sg))) {
if (usglen <= 0)
break;
if (ksglen > usglen) {
@@ -1991,12 +1991,12 @@ sg_remove_scat(Sg_scatter_hold * schp)
} else {
int k;
- for (k = 0; (k < schp->k_use_sg) && sg->page;
+ for (k = 0; (k < schp->k_use_sg) && sg_page(sg);
++k, sg = sg_next(sg)) {
SCSI_LOG_TIMEOUT(5, printk(
"sg_remove_scat: k=%d, pg=0x%p, len=%d\n",
- k, sg->page, sg->length));
- sg_page_free(sg->page, sg->length);
+ k, sg_page(sg), sg->length));
+ sg_page_free(sg_page(sg), sg->length);
}
}
kfree(schp->buffer);
@@ -2038,7 +2038,7 @@ sg_read_xfer(Sg_request * srp)
} else
onum = 1;
- p = page_address(sg->page);
+ p = page_address(sg_page(sg));
ksglen = sg->length;
for (j = 0, k = 0; j < onum; ++j) {
res = sg_u_iovec(hp, iovec_count, j, 0, &usglen, &up);
@@ -2046,7 +2046,7 @@ sg_read_xfer(Sg_request * srp)
return res;
for (; p; sg = sg_next(sg), ksglen = sg->length,
- p = page_address(sg->page)) {
+ p = page_address(sg_page(sg))) {
if (usglen <= 0)
break;
if (ksglen > usglen) {
@@ -2092,15 +2092,15 @@ sg_read_oxfer(Sg_request * srp, char __user *outp, int num_read_xfer)
if ((!outp) || (num_read_xfer <= 0))
return 0;
- for (k = 0; (k < schp->k_use_sg) && sg->page; ++k, sg = sg_next(sg)) {
+ for (k = 0; (k < schp->k_use_sg) && sg_page(sg); ++k, sg = sg_next(sg)) {
num = sg->length;
if (num > num_read_xfer) {
- if (__copy_to_user(outp, page_address(sg->page),
+ if (__copy_to_user(outp, page_address(sg_page(sg)),
num_read_xfer))
return -EFAULT;
break;
} else {
- if (__copy_to_user(outp, page_address(sg->page),
+ if (__copy_to_user(outp, page_address(sg_page(sg)),
num))
return -EFAULT;
num_read_xfer -= num;
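The sg_next() conversions in sg.c matter for the same reason: a chained table is not one contiguous array, since the final slot of a segment may hold a link to the next segment rather than data, so bare sg++ pointer arithmetic is no longer safe. Iteration goes through the helpers instead; a minimal sketch (sgl/nents are illustrative):

	struct scatterlist *sg;
	unsigned int total = 0;
	int i;

	for_each_sg(sgl, sg, nents, i)	/* follows chain links via sg_next() */
		total += sg->length;
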
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 73c44cbdea4..ce69b9efc10 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -3797,7 +3797,7 @@ static void buf_to_sg(struct st_buffer *STbp, unsigned int length)
sg = &(STbp->sg[0]);
frp = STbp->frp;
for (i=count=0; count < length; i++) {
- sg[i].page = frp[i].page;
+ sg_set_page(&sg[i], frp[i].page);
if (length - count > frp[i].length)
sg[i].length = frp[i].length;
else
@@ -4446,14 +4446,14 @@ static int sgl_map_user_pages(struct scatterlist *sgl, const unsigned int max_pa
}
/* Populate the scatter/gather list */
- sgl[0].page = pages[0];
+ sg_set_page(&sgl[0], pages[0]);
sgl[0].offset = uaddr & ~PAGE_MASK;
if (nr_pages > 1) {
sgl[0].length = PAGE_SIZE - sgl[0].offset;
count -= sgl[0].length;
for (i=1; i < nr_pages ; i++) {
+			sg_set_page(&sgl[i], pages[i]);
sgl[i].offset = 0;
- sgl[i].page = pages[i];
sgl[i].length = count < PAGE_SIZE ? count : PAGE_SIZE;
count -= PAGE_SIZE;
}
@@ -4483,7 +4483,7 @@ static int sgl_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_p
int i;
for (i=0; i < nr_pages; i++) {
- struct page *page = sgl[i].page;
+ struct page *page = sg_page(&sgl[i]);
if (dirtied)
SetPageDirty(page);
diff --git a/drivers/scsi/sun3_NCR5380.c b/drivers/scsi/sun3_NCR5380.c
index 4aafe89b557..2dcde373b20 100644
--- a/drivers/scsi/sun3_NCR5380.c
+++ b/drivers/scsi/sun3_NCR5380.c
@@ -272,8 +272,7 @@ static struct scsi_host_template *the_template = NULL;
#define HOSTNO instance->host_no
#define H_NO(cmd) (cmd)->device->host->host_no
-#define SGADDR(buffer) (void *)(((unsigned long)page_address((buffer)->page)) + \
- (buffer)->offset)
+#define SGADDR(buffer)	(void *)sg_virt(buffer)
#ifdef SUPPORT_TAGS
diff --git a/drivers/scsi/sym53c416.c b/drivers/scsi/sym53c416.c
index 8befab7e983..90cee94d952 100644
--- a/drivers/scsi/sym53c416.c
+++ b/drivers/scsi/sym53c416.c
@@ -196,7 +196,7 @@ static unsigned int sym53c416_base_3[2] = {0,0};
#define MAXHOSTS 4
-#define SG_ADDRESS(buffer) ((char *) (page_address((buffer)->page)+(buffer)->offset))
+#define SG_ADDRESS(buffer) ((char *) sg_virt((buffer)))
enum phases
{
diff --git a/drivers/scsi/tmscsim.c b/drivers/scsi/tmscsim.c
index 5c72ca31a47..44193049c4a 100644
--- a/drivers/scsi/tmscsim.c
+++ b/drivers/scsi/tmscsim.c
@@ -430,10 +430,7 @@ static __inline__ void dc390_Going_remove (struct dc390_dcb* pDCB, struct dc390_
static struct scatterlist* dc390_sg_build_single(struct scatterlist *sg, void *addr, unsigned int length)
{
- memset(sg, 0, sizeof(struct scatterlist));
- sg->page = virt_to_page(addr);
- sg->length = length;
- sg->offset = (unsigned long)addr & ~PAGE_MASK;
+ sg_init_one(sg, addr, length);
return sg;
}
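sg_init_one() bundles exactly the three statements deleted above: clear the entry, point it at the page backing a kernel-virtual address, and record the in-page offset and length; with chained scatterlists it also terminates the one-entry list. Roughly, assuming a lowmem address as dc390_sg_build_single() does:

	memset(sg, 0, sizeof(*sg));
	sg_set_page(sg, virt_to_page(addr));
	sg->offset = offset_in_page(addr);
	sg->length = length;
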
diff --git a/drivers/scsi/ultrastor.c b/drivers/scsi/ultrastor.c
index ea72bbeb8f9..6d1f0edd798 100644
--- a/drivers/scsi/ultrastor.c
+++ b/drivers/scsi/ultrastor.c
@@ -681,7 +681,7 @@ static inline void build_sg_list(struct mscp *mscp, struct scsi_cmnd *SCpnt)
max = scsi_sg_count(SCpnt);
scsi_for_each_sg(SCpnt, sg, max, i) {
- mscp->sglist[i].address = isa_page_to_bus(sg->page) + sg->offset;
+ mscp->sglist[i].address = isa_page_to_bus(sg_page(sg)) + sg->offset;
mscp->sglist[i].num_bytes = sg->length;
transfer_length += sg->length;
}
diff --git a/drivers/scsi/wd33c93.c b/drivers/scsi/wd33c93.c
index 0e8e642fd3b..fdbb92d1f72 100644
--- a/drivers/scsi/wd33c93.c
+++ b/drivers/scsi/wd33c93.c
@@ -410,8 +410,7 @@ wd33c93_queuecommand(struct scsi_cmnd *cmd,
if (cmd->use_sg) {
cmd->SCp.buffer = (struct scatterlist *) cmd->request_buffer;
cmd->SCp.buffers_residual = cmd->use_sg - 1;
- cmd->SCp.ptr = page_address(cmd->SCp.buffer->page) +
- cmd->SCp.buffer->offset;
+ cmd->SCp.ptr = sg_virt(cmd->SCp.buffer);
cmd->SCp.this_residual = cmd->SCp.buffer->length;
} else {
cmd->SCp.buffer = NULL;
@@ -745,8 +744,7 @@ transfer_bytes(const wd33c93_regs regs, struct scsi_cmnd *cmd,
++cmd->SCp.buffer;
--cmd->SCp.buffers_residual;
cmd->SCp.this_residual = cmd->SCp.buffer->length;
- cmd->SCp.ptr = page_address(cmd->SCp.buffer->page) +
- cmd->SCp.buffer->offset;
+ cmd->SCp.ptr = sg_virt(cmd->SCp.buffer);
}
if (!cmd->SCp.this_residual) /* avoid bogus setups */
return;
diff --git a/drivers/scsi/wd7000.c b/drivers/scsi/wd7000.c
index 255c611e78b..03cd44f231d 100644
--- a/drivers/scsi/wd7000.c
+++ b/drivers/scsi/wd7000.c
@@ -1123,7 +1123,7 @@ static int wd7000_queuecommand(struct scsi_cmnd *SCpnt,
any2scsi(scb->maxlen, nseg * sizeof(Sgb));
scsi_for_each_sg(SCpnt, sg, nseg, i) {
- any2scsi(sgb[i].ptr, isa_page_to_bus(sg->page) + sg->offset);
+ any2scsi(sgb[i].ptr, isa_page_to_bus(sg_page(sg)) + sg->offset);
any2scsi(sgb[i].len, sg->length);
}
} else {
diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index 87665d7df6f..ed438bc7e98 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -624,7 +624,7 @@ choice
config SERIAL_BFIN_DMA
bool "DMA mode"
- depends on DMA_UNCACHED_1M && !KGDB_UART
+ depends on !DMA_UNCACHED_NONE && !KGDB_UART
help
This driver works under DMA mode. If this option is selected, the
blackfin simple dma driver is also enabled.
diff --git a/drivers/serial/mcf.c b/drivers/serial/mcf.c
new file mode 100644
index 00000000000..a7d4360ea7d
--- /dev/null
+++ b/drivers/serial/mcf.c
@@ -0,0 +1,653 @@
+/****************************************************************************/
+
+/*
+ * mcf.c -- Freescale ColdFire UART driver
+ *
+ * (C) Copyright 2003-2007, Greg Ungerer <gerg@snapgear.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+/****************************************************************************/
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/console.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+#include <linux/serial.h>
+#include <linux/serial_core.h>
+#include <linux/io.h>
+#include <asm/coldfire.h>
+#include <asm/mcfsim.h>
+#include <asm/mcfuart.h>
+#include <asm/nettel.h>
+
+/****************************************************************************/
+
+/*
+ * Some boards implement the DTR/DCD lines using GPIO lines, most
+ * don't. Dummy out the access macros for those that don't. Those
+ * that do should define these macros somewhere in their board-
+ * specific include files.
+ */
+#if !defined(mcf_getppdcd)
+#define mcf_getppdcd(p) (1)
+#endif
+#if !defined(mcf_getppdtr)
+#define mcf_getppdtr(p) (1)
+#endif
+#if !defined(mcf_setppdtr)
+#define mcf_setppdtr(p, v) do { } while (0)
+#endif
+
+/****************************************************************************/
+
+/*
+ * Local per-uart structure.
+ */
+struct mcf_uart {
+ struct uart_port port;
+ unsigned int sigs; /* Local copy of line sigs */
+ unsigned char imr; /* Local IMR mirror */
+};
+
+/****************************************************************************/
+
+static unsigned int mcf_tx_empty(struct uart_port *port)
+{
+ return (readb(port->membase + MCFUART_USR) & MCFUART_USR_TXEMPTY) ?
+ TIOCSER_TEMT : 0;
+}
+
+/****************************************************************************/
+
+static unsigned int mcf_get_mctrl(struct uart_port *port)
+{
+ struct mcf_uart *pp = (struct mcf_uart *) port;
+ unsigned long flags;
+ unsigned int sigs;
+
+ spin_lock_irqsave(&port->lock, flags);
+ sigs = (readb(port->membase + MCFUART_UIPR) & MCFUART_UIPR_CTS) ?
+ 0 : TIOCM_CTS;
+ sigs |= (pp->sigs & TIOCM_RTS);
+ sigs |= (mcf_getppdcd(port->line) ? TIOCM_CD : 0);
+ sigs |= (mcf_getppdtr(port->line) ? TIOCM_DTR : 0);
+ spin_unlock_irqrestore(&port->lock, flags);
+ return sigs;
+}
+
+/****************************************************************************/
+
+static void mcf_set_mctrl(struct uart_port *port, unsigned int sigs)
+{
+ struct mcf_uart *pp = (struct mcf_uart *) port;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->lock, flags);
+ pp->sigs = sigs;
+ mcf_setppdtr(port->line, (sigs & TIOCM_DTR));
+ if (sigs & TIOCM_RTS)
+ writeb(MCFUART_UOP_RTS, port->membase + MCFUART_UOP1);
+ else
+ writeb(MCFUART_UOP_RTS, port->membase + MCFUART_UOP0);
+ spin_unlock_irqrestore(&port->lock, flags);
+}
+
+/****************************************************************************/
+
+static void mcf_start_tx(struct uart_port *port)
+{
+ struct mcf_uart *pp = (struct mcf_uart *) port;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->lock, flags);
+ pp->imr |= MCFUART_UIR_TXREADY;
+ writeb(pp->imr, port->membase + MCFUART_UIMR);
+ spin_unlock_irqrestore(&port->lock, flags);
+}
+
+/****************************************************************************/
+
+static void mcf_stop_tx(struct uart_port *port)
+{
+ struct mcf_uart *pp = (struct mcf_uart *) port;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->lock, flags);
+ pp->imr &= ~MCFUART_UIR_TXREADY;
+ writeb(pp->imr, port->membase + MCFUART_UIMR);
+ spin_unlock_irqrestore(&port->lock, flags);
+}
+
+/****************************************************************************/
+
+static void mcf_stop_rx(struct uart_port *port)
+{
+ struct mcf_uart *pp = (struct mcf_uart *) port;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->lock, flags);
+ pp->imr &= ~MCFUART_UIR_RXREADY;
+ writeb(pp->imr, port->membase + MCFUART_UIMR);
+ spin_unlock_irqrestore(&port->lock, flags);
+}
+
+/****************************************************************************/
+
+static void mcf_break_ctl(struct uart_port *port, int break_state)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->lock, flags);
+ if (break_state == -1)
+ writeb(MCFUART_UCR_CMDBREAKSTART, port->membase + MCFUART_UCR);
+ else
+ writeb(MCFUART_UCR_CMDBREAKSTOP, port->membase + MCFUART_UCR);
+ spin_unlock_irqrestore(&port->lock, flags);
+}
+
+/****************************************************************************/
+
+static void mcf_enable_ms(struct uart_port *port)
+{
+}
+
+/****************************************************************************/
+
+static int mcf_startup(struct uart_port *port)
+{
+ struct mcf_uart *pp = (struct mcf_uart *) port;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->lock, flags);
+
+ /* Reset UART, get it into known state... */
+ writeb(MCFUART_UCR_CMDRESETRX, port->membase + MCFUART_UCR);
+ writeb(MCFUART_UCR_CMDRESETTX, port->membase + MCFUART_UCR);
+
+ /* Enable the UART transmitter and receiver */
+ writeb(MCFUART_UCR_RXENABLE | MCFUART_UCR_TXENABLE,
+ port->membase + MCFUART_UCR);
+
+ /* Enable RX interrupts now */
+ pp->imr = MCFUART_UIR_RXREADY;
+ writeb(pp->imr, port->membase + MCFUART_UIMR);
+
+ spin_unlock_irqrestore(&port->lock, flags);
+
+ return 0;
+}
+
+/****************************************************************************/
+
+static void mcf_shutdown(struct uart_port *port)
+{
+ struct mcf_uart *pp = (struct mcf_uart *) port;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->lock, flags);
+
+ /* Disable all interrupts now */
+ pp->imr = 0;
+ writeb(pp->imr, port->membase + MCFUART_UIMR);
+
+ /* Disable UART transmitter and receiver */
+ writeb(MCFUART_UCR_CMDRESETRX, port->membase + MCFUART_UCR);
+ writeb(MCFUART_UCR_CMDRESETTX, port->membase + MCFUART_UCR);
+
+ spin_unlock_irqrestore(&port->lock, flags);
+}
+
+/****************************************************************************/
+
+static void mcf_set_termios(struct uart_port *port, struct ktermios *termios,
+ struct ktermios *old)
+{
+ unsigned long flags;
+ unsigned int baud, baudclk;
+ unsigned char mr1, mr2;
+
+ baud = uart_get_baud_rate(port, termios, old, 0, 230400);
+ baudclk = ((MCF_BUSCLK / baud) + 16) / 32;
+
+ mr1 = MCFUART_MR1_RXIRQRDY | MCFUART_MR1_RXERRCHAR;
+ mr2 = 0;
+
+ switch (termios->c_cflag & CSIZE) {
+ case CS5: mr1 |= MCFUART_MR1_CS5; break;
+ case CS6: mr1 |= MCFUART_MR1_CS6; break;
+ case CS7: mr1 |= MCFUART_MR1_CS7; break;
+ case CS8:
+ default: mr1 |= MCFUART_MR1_CS8; break;
+ }
+
+ if (termios->c_cflag & PARENB) {
+ if (termios->c_cflag & CMSPAR) {
+ if (termios->c_cflag & PARODD)
+ mr1 |= MCFUART_MR1_PARITYMARK;
+ else
+ mr1 |= MCFUART_MR1_PARITYSPACE;
+ } else {
+ if (termios->c_cflag & PARODD)
+ mr1 |= MCFUART_MR1_PARITYODD;
+ else
+ mr1 |= MCFUART_MR1_PARITYEVEN;
+ }
+ } else {
+ mr1 |= MCFUART_MR1_PARITYNONE;
+ }
+
+ if (termios->c_cflag & CSTOPB)
+ mr2 |= MCFUART_MR2_STOP2;
+ else
+ mr2 |= MCFUART_MR2_STOP1;
+
+ if (termios->c_cflag & CRTSCTS) {
+ mr1 |= MCFUART_MR1_RXRTS;
+ mr2 |= MCFUART_MR2_TXCTS;
+ }
+
+ spin_lock_irqsave(&port->lock, flags);
+ writeb(MCFUART_UCR_CMDRESETRX, port->membase + MCFUART_UCR);
+ writeb(MCFUART_UCR_CMDRESETTX, port->membase + MCFUART_UCR);
+ writeb(MCFUART_UCR_CMDRESETMRPTR, port->membase + MCFUART_UCR);
+ writeb(mr1, port->membase + MCFUART_UMR);
+ writeb(mr2, port->membase + MCFUART_UMR);
+ writeb((baudclk & 0xff00) >> 8, port->membase + MCFUART_UBG1);
+ writeb((baudclk & 0xff), port->membase + MCFUART_UBG2);
+ writeb(MCFUART_UCSR_RXCLKTIMER | MCFUART_UCSR_TXCLKTIMER,
+ port->membase + MCFUART_UCSR);
+ writeb(MCFUART_UCR_RXENABLE | MCFUART_UCR_TXENABLE,
+ port->membase + MCFUART_UCR);
+ spin_unlock_irqrestore(&port->lock, flags);
+}
+
+/****************************************************************************/
+
+static void mcf_rx_chars(struct mcf_uart *pp)
+{
+ struct uart_port *port = (struct uart_port *) pp;
+ unsigned char status, ch, flag;
+
+ while ((status = readb(port->membase + MCFUART_USR)) & MCFUART_USR_RXREADY) {
+ ch = readb(port->membase + MCFUART_URB);
+ flag = TTY_NORMAL;
+ port->icount.rx++;
+
+ if (status & MCFUART_USR_RXERR) {
+ writeb(MCFUART_UCR_CMDRESETERR,
+ port->membase + MCFUART_UCR);
+
+ if (status & MCFUART_USR_RXBREAK) {
+ port->icount.brk++;
+ if (uart_handle_break(port))
+ continue;
+ } else if (status & MCFUART_USR_RXPARITY) {
+ port->icount.parity++;
+ } else if (status & MCFUART_USR_RXOVERRUN) {
+ port->icount.overrun++;
+ } else if (status & MCFUART_USR_RXFRAMING) {
+ port->icount.frame++;
+ }
+
+ status &= port->read_status_mask;
+
+ if (status & MCFUART_USR_RXBREAK)
+ flag = TTY_BREAK;
+ else if (status & MCFUART_USR_RXPARITY)
+ flag = TTY_PARITY;
+ else if (status & MCFUART_USR_RXFRAMING)
+ flag = TTY_FRAME;
+ }
+
+ if (uart_handle_sysrq_char(port, ch))
+ continue;
+ uart_insert_char(port, status, MCFUART_USR_RXOVERRUN, ch, flag);
+ }
+
+ tty_flip_buffer_push(port->info->tty);
+}
+
+/****************************************************************************/
+
+static void mcf_tx_chars(struct mcf_uart *pp)
+{
+ struct uart_port *port = (struct uart_port *) pp;
+ struct circ_buf *xmit = &port->info->xmit;
+
+ if (port->x_char) {
+ /* Send special char - probably flow control */
+ writeb(port->x_char, port->membase + MCFUART_UTB);
+ port->x_char = 0;
+ port->icount.tx++;
+ return;
+ }
+
+ while (readb(port->membase + MCFUART_USR) & MCFUART_USR_TXREADY) {
+ if (xmit->head == xmit->tail)
+ break;
+ writeb(xmit->buf[xmit->tail], port->membase + MCFUART_UTB);
+ xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE -1);
+ port->icount.tx++;
+ }
+
+ if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+ uart_write_wakeup(port);
+
+ if (xmit->head == xmit->tail) {
+ pp->imr &= ~MCFUART_UIR_TXREADY;
+ writeb(pp->imr, port->membase + MCFUART_UIMR);
+ }
+}
+
+/****************************************************************************/
+
+static irqreturn_t mcf_interrupt(int irq, void *data)
+{
+ struct uart_port *port = data;
+ struct mcf_uart *pp = (struct mcf_uart *) port;
+ unsigned int isr;
+
+ isr = readb(port->membase + MCFUART_UISR) & pp->imr;
+ if (isr & MCFUART_UIR_RXREADY)
+ mcf_rx_chars(pp);
+ if (isr & MCFUART_UIR_TXREADY)
+ mcf_tx_chars(pp);
+ return IRQ_HANDLED;
+}
+
+/****************************************************************************/
+
+static void mcf_config_port(struct uart_port *port, int flags)
+{
+ port->type = PORT_MCF;
+
+ /* Clear mask, so no surprise interrupts. */
+ writeb(0, port->membase + MCFUART_UIMR);
+
+ if (request_irq(port->irq, mcf_interrupt, IRQF_DISABLED, "UART", port))
+ printk(KERN_ERR "MCF: unable to attach ColdFire UART %d "
+ "interrupt vector=%d\n", port->line, port->irq);
+}
+
+/****************************************************************************/
+
+static const char *mcf_type(struct uart_port *port)
+{
+ return (port->type == PORT_MCF) ? "ColdFire UART" : NULL;
+}
+
+/****************************************************************************/
+
+static int mcf_request_port(struct uart_port *port)
+{
+ /* UARTs always present */
+ return 0;
+}
+
+/****************************************************************************/
+
+static void mcf_release_port(struct uart_port *port)
+{
+ /* Nothing to release... */
+}
+
+/****************************************************************************/
+
+static int mcf_verify_port(struct uart_port *port, struct serial_struct *ser)
+{
+ if ((ser->type != PORT_UNKNOWN) && (ser->type != PORT_MCF))
+ return -EINVAL;
+ return 0;
+}
+
+/****************************************************************************/
+
+/*
+ * Define the basic serial functions we support.
+ */
+static struct uart_ops mcf_uart_ops = {
+ .tx_empty = mcf_tx_empty,
+ .get_mctrl = mcf_get_mctrl,
+ .set_mctrl = mcf_set_mctrl,
+ .start_tx = mcf_start_tx,
+ .stop_tx = mcf_stop_tx,
+ .stop_rx = mcf_stop_rx,
+ .enable_ms = mcf_enable_ms,
+ .break_ctl = mcf_break_ctl,
+ .startup = mcf_startup,
+ .shutdown = mcf_shutdown,
+ .set_termios = mcf_set_termios,
+ .type = mcf_type,
+ .request_port = mcf_request_port,
+ .release_port = mcf_release_port,
+ .config_port = mcf_config_port,
+ .verify_port = mcf_verify_port,
+};
+
+static struct mcf_uart mcf_ports[3];
+
+#define MCF_MAXPORTS	ARRAY_SIZE(mcf_ports)
+
+/****************************************************************************/
+#if defined(CONFIG_SERIAL_MCF_CONSOLE)
+/****************************************************************************/
+
+int __init early_mcf_setup(struct mcf_platform_uart *platp)
+{
+ struct uart_port *port;
+ int i;
+
+ for (i = 0; ((i < MCF_MAXPORTS) && (platp[i].mapbase)); i++) {
+ port = &mcf_ports[i].port;
+
+ port->line = i;
+ port->type = PORT_MCF;
+ port->mapbase = platp[i].mapbase;
+ port->membase = (platp[i].membase) ? platp[i].membase :
+ (unsigned char __iomem *) port->mapbase;
+ port->iotype = SERIAL_IO_MEM;
+ port->irq = platp[i].irq;
+ port->uartclk = MCF_BUSCLK;
+ port->flags = ASYNC_BOOT_AUTOCONF;
+ port->ops = &mcf_uart_ops;
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+
+static void mcf_console_putc(struct console *co, const char c)
+{
+	struct uart_port *port = &mcf_ports[co->index].port;
+ int i;
+
+ for (i = 0; (i < 0x10000); i++) {
+ if (readb(port->membase + MCFUART_USR) & MCFUART_USR_TXREADY)
+ break;
+ }
+ writeb(c, port->membase + MCFUART_UTB);
+ for (i = 0; (i < 0x10000); i++) {
+ if (readb(port->membase + MCFUART_USR) & MCFUART_USR_TXREADY)
+ break;
+ }
+}
+
+/****************************************************************************/
+
+static void mcf_console_write(struct console *co, const char *s, unsigned int count)
+{
+ for (; (count); count--, s++) {
+ mcf_console_putc(co, *s);
+ if (*s == '\n')
+ mcf_console_putc(co, '\r');
+ }
+}
+
+/****************************************************************************/
+
+static int __init mcf_console_setup(struct console *co, char *options)
+{
+ struct uart_port *port;
+ int baud = CONFIG_SERIAL_MCF_BAUDRATE;
+ int bits = 8;
+ int parity = 'n';
+ int flow = 'n';
+
+	if ((co->index < 0) || (co->index >= MCF_MAXPORTS))
+ co->index = 0;
+ port = &mcf_ports[co->index].port;
+ if (port->membase == 0)
+ return -ENODEV;
+
+ if (options)
+ uart_parse_options(options, &baud, &parity, &bits, &flow);
+
+ return uart_set_options(port, co, baud, parity, bits, flow);
+}
+
+/****************************************************************************/
+
+static struct uart_driver mcf_driver;
+
+static struct console mcf_console = {
+ .name = "ttyS",
+ .write = mcf_console_write,
+ .device = uart_console_device,
+ .setup = mcf_console_setup,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+ .data = &mcf_driver,
+};
+
+static int __init mcf_console_init(void)
+{
+ register_console(&mcf_console);
+ return 0;
+}
+
+console_initcall(mcf_console_init);
+
+#define MCF_CONSOLE &mcf_console
+
+/****************************************************************************/
+#else
+/****************************************************************************/
+
+#define MCF_CONSOLE NULL
+
+/****************************************************************************/
+#endif /* CONFIG_SERIAL_MCF_CONSOLE */
+/****************************************************************************/
+
+/*
+ * Define the mcf UART driver structure.
+ */
+static struct uart_driver mcf_driver = {
+ .owner = THIS_MODULE,
+ .driver_name = "mcf",
+ .dev_name = "ttyS",
+ .major = TTY_MAJOR,
+ .minor = 64,
+ .nr = MCF_MAXPORTS,
+ .cons = MCF_CONSOLE,
+};
+
+/****************************************************************************/
+
+static int __devinit mcf_probe(struct platform_device *pdev)
+{
+ struct mcf_platform_uart *platp = pdev->dev.platform_data;
+ struct uart_port *port;
+ int i;
+
+ for (i = 0; ((i < MCF_MAXPORTS) && (platp[i].mapbase)); i++) {
+ port = &mcf_ports[i].port;
+
+ port->line = i;
+ port->type = PORT_MCF;
+ port->mapbase = platp[i].mapbase;
+ port->membase = (platp[i].membase) ? platp[i].membase :
+ (unsigned char __iomem *) platp[i].mapbase;
+ port->iotype = SERIAL_IO_MEM;
+ port->irq = platp[i].irq;
+ port->uartclk = MCF_BUSCLK;
+ port->ops = &mcf_uart_ops;
+ port->flags = ASYNC_BOOT_AUTOCONF;
+
+ uart_add_one_port(&mcf_driver, port);
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+
+static int __devexit mcf_remove(struct platform_device *pdev)
+{
+ struct uart_port *port;
+ int i;
+
+ for (i = 0; (i < MCF_MAXPORTS); i++) {
+ port = &mcf_ports[i].port;
+ if (port)
+ uart_remove_one_port(&mcf_driver, port);
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+
+static struct platform_driver mcf_platform_driver = {
+ .probe = mcf_probe,
+ .remove = __devexit_p(mcf_remove),
+ .driver = {
+ .name = "mcfuart",
+ .owner = THIS_MODULE,
+ },
+};
+
+/****************************************************************************/
+
+static int __init mcf_init(void)
+{
+ int rc;
+
+	printk(KERN_INFO "ColdFire internal UART serial driver\n");
+
+ rc = uart_register_driver(&mcf_driver);
+ if (rc)
+ return rc;
+ rc = platform_driver_register(&mcf_platform_driver);
+ if (rc)
+ return rc;
+ return 0;
+}
+
+/****************************************************************************/
+
+static void __exit mcf_exit(void)
+{
+ platform_driver_unregister(&mcf_platform_driver);
+ uart_unregister_driver(&mcf_driver);
+}
+
+/****************************************************************************/
+
+module_init(mcf_init);
+module_exit(mcf_exit);
+
+MODULE_AUTHOR("Greg Ungerer <gerg@snapgear.com>");
+MODULE_DESCRIPTION("Freescale ColdFire UART driver");
+MODULE_LICENSE("GPL");
+
+/****************************************************************************/
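A note on the divisor math in mcf_set_termios(): in prescaled-timer mode the ColdFire UART generates baud = busclk / (32 * divisor), so the driver wants divisor = busclk / (32 * baud), and the +16 term rounds that division to the nearest divisor instead of truncating. Worked through for an assumed 60 MHz bus clock at 115200 baud (the clock value is illustrative; MCF_BUSCLK is board-specific):

	baudclk = ((60000000 / 115200) + 16) / 32 = (520 + 16) / 32 = 16
	actual  = 60000000 / (32 * 16) = 117187 baud	(about 1.7% fast)
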
diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
index 8dd5a6afd51..8bdaa157ffe 100644
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c
@@ -11,9 +11,9 @@
#include <linux/timer.h>
#include <linux/ctype.h>
#include <linux/device.h>
+#include <linux/scatterlist.h>
#include <linux/usb/quirks.h>
#include <asm/byteorder.h>
-#include <asm/scatterlist.h>
#include "hcd.h" /* for usbcore internals */
#include "usb.h"
@@ -437,13 +437,11 @@ int usb_sg_init (
#if defined(CONFIG_HIGHMEM) || defined(CONFIG_IOMMU)
io->urbs[i]->transfer_buffer = NULL;
#else
- io->urbs[i]->transfer_buffer =
- page_address(sg[i].page) + sg[i].offset;
+ io->urbs[i]->transfer_buffer = sg_virt(&sg[i]);
#endif
} else {
/* hc may use _only_ transfer_buffer */
- io->urbs [i]->transfer_buffer =
- page_address (sg [i].page) + sg [i].offset;
+ io->urbs [i]->transfer_buffer = sg_virt(&sg[i]);
len = sg [i].length;
}
diff --git a/drivers/usb/image/microtek.c b/drivers/usb/image/microtek.c
index e7d982a7154..91e999c9f68 100644
--- a/drivers/usb/image/microtek.c
+++ b/drivers/usb/image/microtek.c
@@ -519,8 +519,7 @@ static void mts_do_sg (struct urb* transfer)
context->fragment++;
mts_int_submit_urb(transfer,
context->data_pipe,
- page_address(sg[context->fragment].page) +
- sg[context->fragment].offset,
+ sg_virt(&sg[context->fragment]),
sg[context->fragment].length,
context->fragment + 1 == scsi_sg_count(context->srb) ?
mts_data_done : mts_do_sg);
@@ -557,7 +556,7 @@ mts_build_transfer_context(struct scsi_cmnd *srb, struct mts_desc* desc)
return;
} else {
sg = scsi_sglist(srb);
- desc->context.data = page_address(sg[0].page) + sg[0].offset;
+ desc->context.data = sg_virt(&sg[0]);
desc->context.data_length = sg[0].length;
}
diff --git a/drivers/usb/misc/usbtest.c b/drivers/usb/misc/usbtest.c
index e901d31e051..ea316214648 100644
--- a/drivers/usb/misc/usbtest.c
+++ b/drivers/usb/misc/usbtest.c
@@ -360,9 +360,9 @@ static void free_sglist (struct scatterlist *sg, int nents)
if (!sg)
return;
for (i = 0; i < nents; i++) {
- if (!sg [i].page)
+ if (!sg_page(&sg[i]))
continue;
- kfree (page_address (sg [i].page) + sg [i].offset);
+ kfree (sg_virt(&sg[i]));
}
kfree (sg);
}
diff --git a/drivers/usb/storage/protocol.c b/drivers/usb/storage/protocol.c
index cc8f7c52c72..889622baac2 100644
--- a/drivers/usb/storage/protocol.c
+++ b/drivers/usb/storage/protocol.c
@@ -195,7 +195,7 @@ unsigned int usb_stor_access_xfer_buf(unsigned char *buffer,
* the *offset and *index values for the next loop. */
cnt = 0;
while (cnt < buflen) {
- struct page *page = sg->page +
+ struct page *page = sg_page(sg) +
((sg->offset + *offset) >> PAGE_SHIFT);
unsigned int poff =
(sg->offset + *offset) & (PAGE_SIZE-1);
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
new file mode 100644
index 00000000000..9e33fc4da87
--- /dev/null
+++ b/drivers/virtio/Kconfig
@@ -0,0 +1,8 @@
+# Virtio always gets selected by whoever wants it.
+config VIRTIO
+ bool
+
+# Similarly the virtio ring implementation.
+config VIRTIO_RING
+ bool
+ depends on VIRTIO
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
new file mode 100644
index 00000000000..f70e40971dd
--- /dev/null
+++ b/drivers/virtio/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_VIRTIO) += virtio.o
+obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o
diff --git a/drivers/virtio/config.c b/drivers/virtio/config.c
new file mode 100644
index 00000000000..983d482fba4
--- /dev/null
+++ b/drivers/virtio/config.c
@@ -0,0 +1,13 @@
+/* Configuration space parsing helpers for virtio.
+ *
+ * The configuration is [type][len][... len bytes ...] fields.
+ *
+ * Copyright 2007 Rusty Russell, IBM Corporation.
+ * GPL v2 or later.
+ */
+#include <linux/err.h>
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/bug.h>
+#include <asm/system.h>
+
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
new file mode 100644
index 00000000000..15d7787dea8
--- /dev/null
+++ b/drivers/virtio/virtio.c
@@ -0,0 +1,189 @@
+#include <linux/virtio.h>
+#include <linux/spinlock.h>
+#include <linux/virtio_config.h>
+
+static ssize_t device_show(struct device *_d,
+ struct device_attribute *attr, char *buf)
+{
+ struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
+	return sprintf(buf, "%hu\n", dev->id.device);
+}
+static ssize_t vendor_show(struct device *_d,
+ struct device_attribute *attr, char *buf)
+{
+ struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
+	return sprintf(buf, "%hu\n", dev->id.vendor);
+}
+static ssize_t status_show(struct device *_d,
+ struct device_attribute *attr, char *buf)
+{
+ struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
+	return sprintf(buf, "0x%08x\n", dev->config->get_status(dev));
+}
+static ssize_t modalias_show(struct device *_d,
+ struct device_attribute *attr, char *buf)
+{
+ struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
+
+ return sprintf(buf, "virtio:d%08Xv%08X\n",
+ dev->id.device, dev->id.vendor);
+}
+static struct device_attribute virtio_dev_attrs[] = {
+ __ATTR_RO(device),
+ __ATTR_RO(vendor),
+ __ATTR_RO(status),
+ __ATTR_RO(modalias),
+ __ATTR_NULL
+};
+
+static inline int virtio_id_match(const struct virtio_device *dev,
+ const struct virtio_device_id *id)
+{
+ if (id->device != dev->id.device)
+ return 0;
+
+	return id->vendor == VIRTIO_DEV_ANY_ID || id->vendor == dev->id.vendor;
+}
+
+/* This looks through all the IDs a driver claims to support. If any of them
+ * match, we return 1 and the kernel will call virtio_dev_probe(). */
+static int virtio_dev_match(struct device *_dv, struct device_driver *_dr)
+{
+ unsigned int i;
+ struct virtio_device *dev = container_of(_dv,struct virtio_device,dev);
+ const struct virtio_device_id *ids;
+
+ ids = container_of(_dr, struct virtio_driver, driver)->id_table;
+ for (i = 0; ids[i].device; i++)
+ if (virtio_id_match(dev, &ids[i]))
+ return 1;
+ return 0;
+}
+
+static int virtio_uevent(struct device *_dv, struct kobj_uevent_env *env)
+{
+ struct virtio_device *dev = container_of(_dv,struct virtio_device,dev);
+
+ return add_uevent_var(env, "MODALIAS=virtio:d%08Xv%08X",
+ dev->id.device, dev->id.vendor);
+}
+
+static struct bus_type virtio_bus = {
+ .name = "virtio",
+ .match = virtio_dev_match,
+ .dev_attrs = virtio_dev_attrs,
+ .uevent = virtio_uevent,
+};
+
+static void add_status(struct virtio_device *dev, unsigned status)
+{
+ dev->config->set_status(dev, dev->config->get_status(dev) | status);
+}
+
+static int virtio_dev_probe(struct device *_d)
+{
+ int err;
+ struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
+ struct virtio_driver *drv = container_of(dev->dev.driver,
+ struct virtio_driver, driver);
+
+ add_status(dev, VIRTIO_CONFIG_S_DRIVER);
+ err = drv->probe(dev);
+ if (err)
+ add_status(dev, VIRTIO_CONFIG_S_FAILED);
+ else
+ add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
+ return err;
+}
+
+int register_virtio_driver(struct virtio_driver *driver)
+{
+ driver->driver.bus = &virtio_bus;
+ driver->driver.probe = virtio_dev_probe;
+ return driver_register(&driver->driver);
+}
+EXPORT_SYMBOL_GPL(register_virtio_driver);
+
+void unregister_virtio_driver(struct virtio_driver *driver)
+{
+ driver_unregister(&driver->driver);
+}
+EXPORT_SYMBOL_GPL(unregister_virtio_driver);
+
+int register_virtio_device(struct virtio_device *dev)
+{
+ int err;
+
+ dev->dev.bus = &virtio_bus;
+ sprintf(dev->dev.bus_id, "%u", dev->index);
+
+ /* Acknowledge that we've seen the device. */
+ add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
+
+ /* device_register() causes the bus infrastructure to look for a
+ * matching driver. */
+ err = device_register(&dev->dev);
+ if (err)
+ add_status(dev, VIRTIO_CONFIG_S_FAILED);
+ return err;
+}
+EXPORT_SYMBOL_GPL(register_virtio_device);
+
+void unregister_virtio_device(struct virtio_device *dev)
+{
+ device_unregister(&dev->dev);
+}
+EXPORT_SYMBOL_GPL(unregister_virtio_device);
+
+int __virtio_config_val(struct virtio_device *vdev,
+ u8 type, void *val, size_t size)
+{
+ void *token;
+ unsigned int len;
+
+ token = vdev->config->find(vdev, type, &len);
+ if (!token)
+ return -ENOENT;
+
+ if (len != size)
+ return -EIO;
+
+ vdev->config->get(vdev, token, val, size);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__virtio_config_val);
+
+int virtio_use_bit(struct virtio_device *vdev,
+ void *token, unsigned int len, unsigned int bitnum)
+{
+ unsigned long bits[16];
+
+ /* This makes it convenient to pass-through find() results. */
+ if (!token)
+ return 0;
+
+	/* bit not in the feature half of this bitfield? */
+	if (bitnum >= len * 8 / 2)
+		return 0;
+
+ /* Giant feature bitfields are silly. */
+ BUG_ON(len > sizeof(bits));
+ vdev->config->get(vdev, token, bits, len);
+
+ if (!test_bit(bitnum, bits))
+ return 0;
+
+ /* Set acknowledge bit, and write it back. */
+ set_bit(bitnum + len * 8 / 2, bits);
+ vdev->config->set(vdev, token, bits, len);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(virtio_use_bit);
+
+static int virtio_init(void)
+{
+ if (bus_register(&virtio_bus) != 0)
+ panic("virtio bus registration failed");
+ return 0;
+}
+core_initcall(virtio_init);
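For the shape of a client of this bus, a hedged sketch of driver registration (the device ID value and callbacks are hypothetical; in-tree users such as virtio_blk follow the same pattern):

	static struct virtio_device_id id_table[] = {
		{ 2, VIRTIO_DEV_ANY_ID },	/* hypothetical device ID */
		{ 0 },
	};

	static int example_probe(struct virtio_device *vdev)
	{
		/* find virtqueues, read config via __virtio_config_val(), ... */
		return 0;
	}

	static struct virtio_driver example_driver = {
		.driver.name	= "example-virtio",
		.driver.owner	= THIS_MODULE,
		.id_table	= id_table,
		.probe		= example_probe,
	};

	static int __init example_init(void)
	{
		return register_virtio_driver(&example_driver);
	}
	module_init(example_init);
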
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
new file mode 100644
index 00000000000..0e4baca21b8
--- /dev/null
+++ b/drivers/virtio/virtio_ring.c
@@ -0,0 +1,313 @@
+/* Virtio ring implementation.
+ *
+ * Copyright 2007 Rusty Russell IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include <linux/virtio.h>
+#include <linux/virtio_ring.h>
+#include <linux/device.h>
+
+#ifdef DEBUG
+/* For development, we want to crash whenever the ring is screwed. */
+#define BAD_RING(vq, fmt...) \
+ do { dev_err(&vq->vq.vdev->dev, fmt); BUG(); } while(0)
+#define START_USE(vq) \
+ do { if ((vq)->in_use) panic("in_use = %i\n", (vq)->in_use); (vq)->in_use = __LINE__; mb(); } while(0)
+#define END_USE(vq) \
+ do { BUG_ON(!(vq)->in_use); (vq)->in_use = 0; mb(); } while(0)
+#else
+#define BAD_RING(vq, fmt...) \
+ do { dev_err(&vq->vq.vdev->dev, fmt); (vq)->broken = true; } while(0)
+#define START_USE(vq)
+#define END_USE(vq)
+#endif
+
+struct vring_virtqueue
+{
+ struct virtqueue vq;
+
+ /* Actual memory layout for this queue */
+ struct vring vring;
+
+ /* Other side has made a mess, don't try any more. */
+ bool broken;
+
+ /* Number of free buffers */
+ unsigned int num_free;
+ /* Head of free buffer list. */
+ unsigned int free_head;
+ /* Number we've added since last sync. */
+ unsigned int num_added;
+
+ /* Last used index we've seen. */
+ unsigned int last_used_idx;
+
+ /* How to notify other side. FIXME: commonalize hcalls! */
+ void (*notify)(struct virtqueue *vq);
+
+#ifdef DEBUG
+ /* They're supposed to lock for us. */
+ unsigned int in_use;
+#endif
+
+ /* Tokens for callbacks. */
+ void *data[];
+};
+
+#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
+
+static int vring_add_buf(struct virtqueue *_vq,
+ struct scatterlist sg[],
+ unsigned int out,
+ unsigned int in,
+ void *data)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+ unsigned int i, avail, head, uninitialized_var(prev);
+
+ BUG_ON(data == NULL);
+ BUG_ON(out + in > vq->vring.num);
+ BUG_ON(out + in == 0);
+
+ START_USE(vq);
+
+ if (vq->num_free < out + in) {
+ pr_debug("Can't add buf len %i - avail = %i\n",
+ out + in, vq->num_free);
+ END_USE(vq);
+ return -ENOSPC;
+ }
+
+ /* We're about to use some buffers from the free list. */
+ vq->num_free -= out + in;
+
+ head = vq->free_head;
+ for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) {
+ vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
+ vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<<PAGE_SHIFT)
+ + sg->offset;
+ vq->vring.desc[i].len = sg->length;
+ prev = i;
+ sg++;
+ }
+ for (; in; i = vq->vring.desc[i].next, in--) {
+ vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
+ vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<<PAGE_SHIFT)
+ + sg->offset;
+ vq->vring.desc[i].len = sg->length;
+ prev = i;
+ sg++;
+ }
+ /* Last one doesn't continue. */
+ vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT;
+
+ /* Update free pointer */
+ vq->free_head = i;
+
+ /* Set token. */
+ vq->data[head] = data;
+
+ /* Put entry in available array (but don't update avail->idx until they
+ * do sync). FIXME: avoid modulus here? */
+ avail = (vq->vring.avail->idx + vq->num_added++) % vq->vring.num;
+ vq->vring.avail->ring[avail] = head;
+
+ pr_debug("Added buffer head %i to %p\n", head, vq);
+ END_USE(vq);
+ return 0;
+}
+
+static void vring_kick(struct virtqueue *_vq)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+ START_USE(vq);
+ /* Descriptors and available array need to be set before we expose the
+ * new available array entries. */
+ wmb();
+
+ vq->vring.avail->idx += vq->num_added;
+ vq->num_added = 0;
+
+ /* Need to update avail index before checking if we should notify */
+ mb();
+
+ if (!(vq->vring.used->flags & VRING_USED_F_NO_NOTIFY))
+ /* Prod other side to tell it about changes. */
+ vq->notify(&vq->vq);
+
+ END_USE(vq);
+}
+
+static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
+{
+ unsigned int i;
+
+ /* Clear data ptr. */
+ vq->data[head] = NULL;
+
+ /* Put back on free list: find end */
+ i = head;
+ while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) {
+ i = vq->vring.desc[i].next;
+ vq->num_free++;
+ }
+
+ vq->vring.desc[i].next = vq->free_head;
+ vq->free_head = head;
+ /* Plus final descriptor */
+ vq->num_free++;
+}
+
+/* FIXME: We need to tell other side about removal, to synchronize. */
+static void vring_shutdown(struct virtqueue *_vq)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+ unsigned int i;
+
+ for (i = 0; i < vq->vring.num; i++)
+ detach_buf(vq, i);
+}
+
+static inline bool more_used(const struct vring_virtqueue *vq)
+{
+ return vq->last_used_idx != vq->vring.used->idx;
+}
+
+static void *vring_get_buf(struct virtqueue *_vq, unsigned int *len)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+ void *ret;
+ unsigned int i;
+
+ START_USE(vq);
+
+ if (!more_used(vq)) {
+ pr_debug("No more buffers in queue\n");
+ END_USE(vq);
+ return NULL;
+ }
+
+ i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id;
+ *len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len;
+
+ if (unlikely(i >= vq->vring.num)) {
+ BAD_RING(vq, "id %u out of range\n", i);
+ return NULL;
+ }
+ if (unlikely(!vq->data[i])) {
+ BAD_RING(vq, "id %u is not a head!\n", i);
+ return NULL;
+ }
+
+ /* detach_buf clears data, so grab it now. */
+ ret = vq->data[i];
+ detach_buf(vq, i);
+ vq->last_used_idx++;
+ END_USE(vq);
+ return ret;
+}
+
+static bool vring_restart(struct virtqueue *_vq)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+
+ START_USE(vq);
+ BUG_ON(!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT));
+
+ /* We optimistically turn back on interrupts, then check if there was
+ * more to do. */
+ vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
+ mb();
+ if (unlikely(more_used(vq))) {
+ vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+ END_USE(vq);
+ return false;
+ }
+
+ END_USE(vq);
+ return true;
+}
+
+irqreturn_t vring_interrupt(int irq, void *_vq)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+
+ if (!more_used(vq)) {
+ pr_debug("virtqueue interrupt with no work for %p\n", vq);
+ return IRQ_NONE;
+ }
+
+ if (unlikely(vq->broken))
+ return IRQ_HANDLED;
+
+ pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback);
+ if (vq->vq.callback && !vq->vq.callback(&vq->vq))
+ vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+
+ return IRQ_HANDLED;
+}
+
+static struct virtqueue_ops vring_vq_ops = {
+ .add_buf = vring_add_buf,
+ .get_buf = vring_get_buf,
+ .kick = vring_kick,
+ .restart = vring_restart,
+ .shutdown = vring_shutdown,
+};
+
+struct virtqueue *vring_new_virtqueue(unsigned int num,
+ struct virtio_device *vdev,
+ void *pages,
+ void (*notify)(struct virtqueue *),
+ bool (*callback)(struct virtqueue *))
+{
+ struct vring_virtqueue *vq;
+ unsigned int i;
+
+ vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL);
+ if (!vq)
+ return NULL;
+
+ vring_init(&vq->vring, num, pages);
+ vq->vq.callback = callback;
+ vq->vq.vdev = vdev;
+ vq->vq.vq_ops = &vring_vq_ops;
+ vq->notify = notify;
+ vq->broken = false;
+ vq->last_used_idx = 0;
+ vq->num_added = 0;
+#ifdef DEBUG
+ vq->in_use = false;
+#endif
+
+ /* No callback? Tell other side not to bother us. */
+ if (!callback)
+ vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+
+ /* Put everything in free lists. */
+ vq->num_free = num;
+ vq->free_head = 0;
+ for (i = 0; i < num-1; i++)
+ vq->vring.desc[i].next = i+1;
+
+ return &vq->vq;
+}
+
+void vring_del_virtqueue(struct virtqueue *vq)
+{
+ kfree(to_vvq(vq));
+}
+
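And the producer/consumer flow against the ring, sketched through the vq_ops table (buf/len are illustrative; vq comes from vring_new_virtqueue()):

	struct scatterlist sg;
	unsigned int used_len;
	void *token;

	sg_init_one(&sg, buf, len);		/* one buffer, host-readable */
	if (vq->vq_ops->add_buf(vq, &sg, 1, 0, buf) == 0)
		vq->vq_ops->kick(vq);		/* publish avail idx, notify */

	/* later, typically from the virtqueue callback: */
	while ((token = vq->vq_ops->get_buf(vq, &used_len)) != NULL) {
		/* token is the cookie given to add_buf(); used_len is the
		 * byte count the other side wrote (0 for out-only buffers) */
	}
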
diff --git a/drivers/watchdog/mpc5200_wdt.c b/drivers/watchdog/mpc5200_wdt.c
index 9cfb9757662..11f6a111e75 100644
--- a/drivers/watchdog/mpc5200_wdt.c
+++ b/drivers/watchdog/mpc5200_wdt.c
@@ -176,6 +176,8 @@ static int mpc5200_wdt_probe(struct of_device *op, const struct of_device_id *ma
has_wdt = of_get_property(op->node, "has-wdt", NULL);
if (!has_wdt)
+ has_wdt = of_get_property(op->node, "fsl,has-wdt", NULL);
+ if (!has_wdt)
return -ENODEV;
wdt = kzalloc(sizeof(*wdt), GFP_KERNEL);
@@ -254,6 +256,7 @@ static int mpc5200_wdt_shutdown(struct of_device *op)
static struct of_device_id mpc5200_wdt_match[] = {
{ .compatible = "mpc5200-gpt", },
+ { .compatible = "fsl,mpc5200-gpt", },
{},
};
static struct of_platform_driver mpc5200_wdt_driver = {