Diffstat (limited to 'arch')
-rw-r--r--  arch/avr32/Kconfig | 25
-rw-r--r--  arch/avr32/boards/atstk1000/Kconfig | 53
-rw-r--r--  arch/avr32/boards/atstk1000/atstk1002.c | 50
-rw-r--r--  arch/avr32/kernel/setup.c | 4
-rw-r--r--  arch/avr32/mach-at32ap/Makefile | 1
-rw-r--r--  arch/avr32/mach-at32ap/at32ap.c | 31
-rw-r--r--  arch/avr32/mach-at32ap/at32ap7000.c | 340
-rw-r--r--  arch/avr32/mach-at32ap/cpufreq.c | 112
-rw-r--r--  arch/avr32/mach-at32ap/extint.c | 200
-rw-r--r--  arch/avr32/mach-at32ap/pm.h | 112
-rw-r--r--  arch/avr32/mach-at32ap/sm.h | 242
-rw-r--r--  arch/frv/kernel/setup.c | 1
-rw-r--r--  arch/i386/Kconfig | 2
-rw-r--r--  arch/i386/Makefile | 3
-rw-r--r--  arch/i386/boot/Makefile | 2
-rw-r--r--  arch/i386/boot/boot.h | 2
-rw-r--r--  arch/i386/boot/compressed/relocs.c | 2
-rw-r--r--  arch/i386/boot/cpucheck.c | 4
-rw-r--r--  arch/i386/boot/mca.c | 2
-rw-r--r--  arch/i386/boot/pm.c | 2
-rw-r--r--  arch/i386/boot/tools/build.c | 2
-rw-r--r--  arch/i386/boot/tty.c | 2
-rw-r--r--  arch/i386/boot/video.c | 9
-rw-r--r--  arch/i386/boot/video.h | 9
-rw-r--r--  arch/i386/boot/voyager.c | 2
-rw-r--r--  arch/i386/kernel/asm-offsets.c | 9
-rw-r--r--  arch/i386/kernel/entry.S | 87
-rw-r--r--  arch/i386/kernel/head.S | 5
-rw-r--r--  arch/i386/kernel/paravirt.c | 37
-rw-r--r--  arch/i386/kernel/ptrace.c | 22
-rw-r--r--  arch/i386/kernel/setup.c | 2
-rw-r--r--  arch/i386/kernel/smp.c | 5
-rw-r--r--  arch/i386/kernel/smpboot.c | 8
-rw-r--r--  arch/i386/kernel/syscall_table.S | 1
-rw-r--r--  arch/i386/kernel/traps.c | 10
-rw-r--r--  arch/i386/kernel/tsc.c | 23
-rw-r--r--  arch/i386/kernel/vmi.c | 4
-rw-r--r--  arch/i386/kernel/vmiclock.c | 6
-rw-r--r--  arch/i386/kernel/vmlinux.lds.S | 1
-rw-r--r--  arch/i386/kernel/vsyscall-note.S | 49
-rw-r--r--  arch/i386/mach-voyager/voyager_thread.c | 2
-rw-r--r--  arch/i386/mm/init.c | 3
-rw-r--r--  arch/i386/mm/pageattr.c | 2
-rw-r--r--  arch/i386/xen/Kconfig | 11
-rw-r--r--  arch/i386/xen/Makefile | 4
-rw-r--r--  arch/i386/xen/enlighten.c | 1144
-rw-r--r--  arch/i386/xen/events.c | 590
-rw-r--r--  arch/i386/xen/features.c | 29
-rw-r--r--  arch/i386/xen/manage.c | 143
-rw-r--r--  arch/i386/xen/mmu.c | 564
-rw-r--r--  arch/i386/xen/mmu.h | 60
-rw-r--r--  arch/i386/xen/multicalls.c | 90
-rw-r--r--  arch/i386/xen/multicalls.h | 45
-rw-r--r--  arch/i386/xen/setup.c | 96
-rw-r--r--  arch/i386/xen/smp.c | 404
-rw-r--r--  arch/i386/xen/time.c | 590
-rw-r--r--  arch/i386/xen/xen-asm.S | 291
-rw-r--r--  arch/i386/xen/xen-head.S | 36
-rw-r--r--  arch/i386/xen/xen-ops.h | 71
-rw-r--r--  arch/mips/basler/excite/excite_setup.c | 1
-rw-r--r--  arch/mips/gt64120/wrppmc/setup.c | 1
-rw-r--r--  arch/mips/mips-boards/atlas/atlas_setup.c | 1
-rw-r--r--  arch/mips/mips-boards/sead/sead_setup.c | 1
-rw-r--r--  arch/mips/mipssim/sim_setup.c | 1
-rw-r--r--  arch/mips/pmc-sierra/msp71xx/msp_serial.c | 1
-rw-r--r--  arch/mips/pmc-sierra/yosemite/setup.c | 1
-rw-r--r--  arch/powerpc/kernel/pci-common.c | 7
-rw-r--r--  arch/powerpc/kernel/setup_64.c | 12
-rw-r--r--  arch/powerpc/kernel/sys_ppc32.c | 7
-rw-r--r--  arch/ppc/platforms/4xx/bamboo.c | 1
-rw-r--r--  arch/ppc/platforms/4xx/bubinga.c | 1
-rw-r--r--  arch/ppc/platforms/4xx/cpci405.c | 1
-rw-r--r--  arch/ppc/platforms/4xx/ebony.c | 1
-rw-r--r--  arch/ppc/platforms/4xx/luan.c | 1
-rw-r--r--  arch/ppc/platforms/4xx/ocotea.c | 1
-rw-r--r--  arch/ppc/platforms/4xx/taishan.c | 1
-rw-r--r--  arch/ppc/platforms/4xx/yucca.c | 1
-rw-r--r--  arch/ppc/platforms/85xx/sbc8560.c | 1
-rw-r--r--  arch/ppc/platforms/chestnut.c | 1
-rw-r--r--  arch/ppc/platforms/ev64260.c | 1
-rw-r--r--  arch/ppc/platforms/radstone_ppc7d.c | 1
-rw-r--r--  arch/ppc/platforms/spruce.c | 1
-rw-r--r--  arch/sparc/Kconfig | 3
-rw-r--r--  arch/sparc64/Kconfig | 3
-rw-r--r--  arch/sparc64/kernel/ds.c | 19
-rw-r--r--  arch/sparc64/kernel/mdesc.c | 78
-rw-r--r--  arch/sparc64/kernel/vio.c | 94
-rw-r--r--  arch/sparc64/kernel/viohs.c | 30
-rw-r--r--  arch/x86_64/ia32/ia32entry.S | 1
-rw-r--r--  arch/x86_64/ia32/sys_ia32.c | 8
-rw-r--r--  arch/x86_64/kernel/early_printk.c | 5
-rw-r--r--  arch/x86_64/kernel/mce.c | 2
-rw-r--r--  arch/x86_64/kernel/ptrace.c | 23
93 files changed, 5412 insertions, 558 deletions
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index 3ec76586877..d12346aaa88 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -113,6 +113,10 @@ config BOARD_ATNGW100
bool "ATNGW100 Network Gateway"
endchoice
+if BOARD_ATSTK1000
+source "arch/avr32/boards/atstk1000/Kconfig"
+endif
+
choice
prompt "Boot loader type"
default LOADER_U_BOOT
@@ -185,6 +189,27 @@ config CMDLINE
endmenu
+menu "Power managment options"
+
+menu "CPU Frequency scaling"
+
+source "drivers/cpufreq/Kconfig"
+
+config CPU_FREQ_AT32AP
+ bool "CPU frequency driver for AT32AP"
+ depends on CPU_FREQ && PLATFORM_AT32AP
+ default n
+ help
+ This enables the CPU frequency driver for AT32AP processors.
+
+ For details, take a look in <file:Documentation/cpu-freq>.
+
+ If in doubt, say N.
+
+endmenu
+
+endmenu
+
menu "Bus options"
config PCI
diff --git a/arch/avr32/boards/atstk1000/Kconfig b/arch/avr32/boards/atstk1000/Kconfig
new file mode 100644
index 00000000000..71bc7d364fb
--- /dev/null
+++ b/arch/avr32/boards/atstk1000/Kconfig
@@ -0,0 +1,53 @@
+# STK1000 customization
+
+if BOARD_ATSTK1002
+
+config BOARD_ATSTK1002_CUSTOM
+ bool "Non-default STK-1002 jumper settings"
+ help
+ You will normally leave the jumpers on the CPU card at their
+ default settings. If you need to use certain peripherals,
+ you will need to change some of those jumpers.
+
+if BOARD_ATSTK1002_CUSTOM
+
+config BOARD_ATSTK1002_SW1_CUSTOM
+ bool "SW1: use SSC1 (not SPI0)"
+ help
+ This also prevents using the external DAC as an audio interface,
+ and means you can't initialize the on-board QVGA display.
+
+config BOARD_ATSTK1002_SW2_CUSTOM
+ bool "SW2: use IRDA or TIMER0 (not UART-A, MMC/SD, and PS2-A)"
+ help
+ If you change this you'll want an updated boot loader putting
+ the console on UART-C not UART-A.
+
+config BOARD_ATSTK1002_SW3_CUSTOM
+ bool "SW3: use TIMER1 (not SSC0 and GCLK)"
+ help
+ This also prevents using the external DAC as an audio interface.
+
+config BOARD_ATSTK1002_SW4_CUSTOM
+ bool "SW4: use ISI/Camera (not GPIOs, SPI1, and PS2-B)"
+ help
+ To use the camera interface you'll need a custom card (on the
+ PCI-format connector) connecting a video sensor.
+
+config BOARD_ATSTK1002_SW5_CUSTOM
+ bool "SW5: use MACB1 (not LCDC)"
+
+config BOARD_ATSTK1002_SW6_CUSTOM
+ bool "SW6: more GPIOs (not MACB0)"
+
+endif # custom
+
+config BOARD_ATSTK1002_SPI1
+ bool "Configure SPI1 controller"
+ depends on !BOARD_ATSTK1002_SW4_CUSTOM
+ help
+ All the signals for the second SPI controller are available on
+ GPIO lines and accessed through the J1 jumper block. Say "y"
+ here to configure that SPI controller.
+
+endif # stk 1002
diff --git a/arch/avr32/boards/atstk1000/atstk1002.c b/arch/avr32/boards/atstk1000/atstk1002.c
index e253e86a1a3..cb93eabb9c6 100644
--- a/arch/avr32/boards/atstk1000/atstk1002.c
+++ b/arch/avr32/boards/atstk1000/atstk1002.c
@@ -27,15 +27,27 @@
#include "atstk1000.h"
-#define SW2_DEFAULT /* MMCI and UART_A available */
struct eth_addr {
u8 addr[6];
};
static struct eth_addr __initdata hw_addr[2];
-static struct eth_platform_data __initdata eth_data[2];
+static struct eth_platform_data __initdata eth_data[2] = {
+ {
+ /*
+ * The MDIO pullups on STK1000 are a bit too weak for
+ * the autodetection to work properly, so we have to
+ * mask out everything but the correct address.
+ */
+ .phy_mask = ~(1U << 16),
+ },
+ {
+ .phy_mask = ~(1U << 17),
+ },
+};
+#ifndef CONFIG_BOARD_ATSTK1002_SW1_CUSTOM
static struct spi_board_info spi0_board_info[] __initdata = {
{
/* QVGA display */
@@ -45,6 +57,13 @@ static struct spi_board_info spi0_board_info[] __initdata = {
.mode = SPI_MODE_3,
},
};
+#endif
+
+#ifdef CONFIG_BOARD_ATSTK1002_SPI1
+static struct spi_board_info spi1_board_info[] __initdata = { {
+ /* patch in custom entries here */
+} };
+#endif
/*
* The next two functions should go away as the boot loader is
@@ -103,10 +122,10 @@ static void __init set_hw_addr(struct platform_device *pdev)
void __init setup_board(void)
{
-#ifdef SW2_DEFAULT
- at32_map_usart(1, 0); /* USART 1/A: /dev/ttyS0, DB9 */
-#else
+#ifdef CONFIG_BOARD_ATSTK1002_SW2_CUSTOM
at32_map_usart(0, 1); /* USART 0/B: /dev/ttyS1, IRDA */
+#else
+ at32_map_usart(1, 0); /* USART 1/A: /dev/ttyS0, DB9 */
#endif
/* USART 2/unused: expansion connector */
at32_map_usart(3, 2); /* USART 3/C: /dev/ttyS2, DB9 */
@@ -140,18 +159,31 @@ static int __init atstk1002_init(void)
at32_add_system_devices();
-#ifdef SW2_DEFAULT
- at32_add_device_usart(0);
-#else
+#ifdef CONFIG_BOARD_ATSTK1002_SW2_CUSTOM
at32_add_device_usart(1);
+#else
+ at32_add_device_usart(0);
#endif
at32_add_device_usart(2);
+#ifndef CONFIG_BOARD_ATSTK1002_SW6_CUSTOM
set_hw_addr(at32_add_device_eth(0, &eth_data[0]));
-
+#endif
+#ifndef CONFIG_BOARD_ATSTK1002_SW1_CUSTOM
at32_add_device_spi(0, spi0_board_info, ARRAY_SIZE(spi0_board_info));
+#endif
+#ifdef CONFIG_BOARD_ATSTK1002_SPI1
+ at32_add_device_spi(1, spi1_board_info, ARRAY_SIZE(spi1_board_info));
+#endif
+#ifdef CONFIG_BOARD_ATSTK1002_SW5_CUSTOM
+ set_hw_addr(at32_add_device_eth(1, &eth_data[1]));
+#else
at32_add_device_lcdc(0, &atstk1000_lcdc_data,
fbmem_start, fbmem_size);
+#endif
+#ifndef CONFIG_BOARD_ATSTK1002_SW3_CUSTOM
+ at32_add_device_ssc(0, ATMEL_SSC_TX);
+#endif
return 0;
}
diff --git a/arch/avr32/kernel/setup.c b/arch/avr32/kernel/setup.c
index b279d66acf5..d08b0bc6b2b 100644
--- a/arch/avr32/kernel/setup.c
+++ b/arch/avr32/kernel/setup.c
@@ -313,7 +313,7 @@ __tagtable(ATAG_MEM, parse_tag_mem);
static int __init parse_tag_rdimg(struct tag *tag)
{
-#ifdef CONFIG_INITRD
+#ifdef CONFIG_BLK_DEV_INITRD
struct tag_mem_range *mem = &tag->u.mem_range;
int ret;
@@ -323,7 +323,7 @@ static int __init parse_tag_rdimg(struct tag *tag)
return 0;
}
- ret = add_reserved_region(mem->start, mem->start + mem->size - 1,
+ ret = add_reserved_region(mem->addr, mem->addr + mem->size - 1,
"initrd");
if (ret) {
printk(KERN_WARNING
diff --git a/arch/avr32/mach-at32ap/Makefile b/arch/avr32/mach-at32ap/Makefile
index f1d395724ac..a8b445046e3 100644
--- a/arch/avr32/mach-at32ap/Makefile
+++ b/arch/avr32/mach-at32ap/Makefile
@@ -1,3 +1,4 @@
obj-y += at32ap.o clock.o intc.o extint.o pio.o hsmc.o
obj-$(CONFIG_CPU_AT32AP7000) += at32ap7000.o
obj-$(CONFIG_CPU_AT32AP7000) += time-tc.o
+obj-$(CONFIG_CPU_FREQ_AT32AP) += cpufreq.o
diff --git a/arch/avr32/mach-at32ap/at32ap.c b/arch/avr32/mach-at32ap/at32ap.c
index 90f207e8e96..7c4987f3287 100644
--- a/arch/avr32/mach-at32ap/at32ap.c
+++ b/arch/avr32/mach-at32ap/at32ap.c
@@ -11,41 +11,10 @@
#include <linux/init.h>
#include <linux/platform_device.h>
-#include <asm/io.h>
-
#include <asm/arch/init.h>
-#include <asm/arch/sm.h>
-
-struct at32_sm system_manager;
-
-static int __init at32_sm_init(void)
-{
- struct resource *regs;
- struct at32_sm *sm = &system_manager;
- int ret = -ENXIO;
-
- regs = platform_get_resource(&at32_sm_device, IORESOURCE_MEM, 0);
- if (!regs)
- goto fail;
-
- spin_lock_init(&sm->lock);
- sm->pdev = &at32_sm_device;
-
- ret = -ENOMEM;
- sm->regs = ioremap(regs->start, regs->end - regs->start + 1);
- if (!sm->regs)
- goto fail;
-
- return 0;
-
-fail:
- printk(KERN_ERR "Failed to initialize System Manager: %d\n", ret);
- return ret;
-}
void __init setup_platform(void)
{
- at32_sm_init();
at32_clock_init();
at32_portmux_init();
}
diff --git a/arch/avr32/mach-at32ap/at32ap7000.c b/arch/avr32/mach-at32ap/at32ap7000.c
index 4dda42d3f6d..64cc5583ddf 100644
--- a/arch/avr32/mach-at32ap/at32ap7000.c
+++ b/arch/avr32/mach-at32ap/at32ap7000.c
@@ -17,14 +17,20 @@
#include <asm/arch/at32ap7000.h>
#include <asm/arch/board.h>
#include <asm/arch/portmux.h>
-#include <asm/arch/sm.h>
#include <video/atmel_lcdc.h>
#include "clock.h"
#include "hmatrix.h"
#include "pio.h"
-#include "sm.h"
+#include "pm.h"
+
+/*
+ * We can reduce the code size a bit by using a constant here. Since
+ * this file is completely chip-specific, it's safe to not use
+ * ioremap. Generic drivers should of course never do this.
+ */
+#define AT32_PM_BASE 0xfff00000
#define PBMEM(base) \
{ \
@@ -88,6 +94,8 @@ static struct clk devname##_##_name = { \
.index = _index, \
}
+static DEFINE_SPINLOCK(pm_lock);
+
unsigned long at32ap7000_osc_rates[3] = {
[0] = 32768,
/* FIXME: these are ATSTK1002-specific */
@@ -104,11 +112,11 @@ static unsigned long pll_get_rate(struct clk *clk, unsigned long control)
{
unsigned long div, mul, rate;
- if (!(control & SM_BIT(PLLEN)))
+ if (!(control & PM_BIT(PLLEN)))
return 0;
- div = SM_BFEXT(PLLDIV, control) + 1;
- mul = SM_BFEXT(PLLMUL, control) + 1;
+ div = PM_BFEXT(PLLDIV, control) + 1;
+ mul = PM_BFEXT(PLLMUL, control) + 1;
rate = clk->parent->get_rate(clk->parent);
rate = (rate + div / 2) / div;
@@ -121,7 +129,7 @@ static unsigned long pll0_get_rate(struct clk *clk)
{
u32 control;
- control = sm_readl(&system_manager, PM_PLL0);
+ control = pm_readl(PLL0);
return pll_get_rate(clk, control);
}
@@ -130,7 +138,7 @@ static unsigned long pll1_get_rate(struct clk *clk)
{
u32 control;
- control = sm_readl(&system_manager, PM_PLL1);
+ control = pm_readl(PLL1);
return pll_get_rate(clk, control);
}
@@ -187,108 +195,139 @@ static unsigned long bus_clk_get_rate(struct clk *clk, unsigned int shift)
static void cpu_clk_mode(struct clk *clk, int enabled)
{
- struct at32_sm *sm = &system_manager;
unsigned long flags;
u32 mask;
- spin_lock_irqsave(&sm->lock, flags);
- mask = sm_readl(sm, PM_CPU_MASK);
+ spin_lock_irqsave(&pm_lock, flags);
+ mask = pm_readl(CPU_MASK);
if (enabled)
mask |= 1 << clk->index;
else
mask &= ~(1 << clk->index);
- sm_writel(sm, PM_CPU_MASK, mask);
- spin_unlock_irqrestore(&sm->lock, flags);
+ pm_writel(CPU_MASK, mask);
+ spin_unlock_irqrestore(&pm_lock, flags);
}
static unsigned long cpu_clk_get_rate(struct clk *clk)
{
unsigned long cksel, shift = 0;
- cksel = sm_readl(&system_manager, PM_CKSEL);
- if (cksel & SM_BIT(CPUDIV))
- shift = SM_BFEXT(CPUSEL, cksel) + 1;
+ cksel = pm_readl(CKSEL);
+ if (cksel & PM_BIT(CPUDIV))
+ shift = PM_BFEXT(CPUSEL, cksel) + 1;
return bus_clk_get_rate(clk, shift);
}
+static long cpu_clk_set_rate(struct clk *clk, unsigned long rate, int apply)
+{
+ u32 control;
+ unsigned long parent_rate, child_div, actual_rate, div;
+
+ parent_rate = clk->parent->get_rate(clk->parent);
+ control = pm_readl(CKSEL);
+
+ if (control & PM_BIT(HSBDIV))
+ child_div = 1 << (PM_BFEXT(HSBSEL, control) + 1);
+ else
+ child_div = 1;
+
+ if (rate > 3 * (parent_rate / 4) || child_div == 1) {
+ actual_rate = parent_rate;
+ control &= ~PM_BIT(CPUDIV);
+ } else {
+ unsigned int cpusel;
+ div = (parent_rate + rate / 2) / rate;
+ if (div > child_div)
+ div = child_div;
+ cpusel = (div > 1) ? (fls(div) - 2) : 0;
+ control = PM_BIT(CPUDIV) | PM_BFINS(CPUSEL, cpusel, control);
+ actual_rate = parent_rate / (1 << (cpusel + 1));
+ }
+
+ pr_debug("clk %s: new rate %lu (actual rate %lu)\n",
+ clk->name, rate, actual_rate);
+
+ if (apply)
+ pm_writel(CKSEL, control);
+
+ return actual_rate;
+}
+
static void hsb_clk_mode(struct clk *clk, int enabled)
{
- struct at32_sm *sm = &system_manager;
unsigned long flags;
u32 mask;
- spin_lock_irqsave(&sm->lock, flags);
- mask = sm_readl(sm, PM_HSB_MASK);
+ spin_lock_irqsave(&pm_lock, flags);
+ mask = pm_readl(HSB_MASK);
if (enabled)
mask |= 1 << clk->index;
else
mask &= ~(1 << clk->index);
- sm_writel(sm, PM_HSB_MASK, mask);
- spin_unlock_irqrestore(&sm->lock, flags);
+ pm_writel(HSB_MASK, mask);
+ spin_unlock_irqrestore(&pm_lock, flags);
}
static unsigned long hsb_clk_get_rate(struct clk *clk)
{
unsigned long cksel, shift = 0;
- cksel = sm_readl(&system_manager, PM_CKSEL);
- if (cksel & SM_BIT(HSBDIV))
- shift = SM_BFEXT(HSBSEL, cksel) + 1;
+ cksel = pm_readl(CKSEL);
+ if (cksel & PM_BIT(HSBDIV))
+ shift = PM_BFEXT(HSBSEL, cksel) + 1;
return bus_clk_get_rate(clk, shift);
}
static void pba_clk_mode(struct clk *clk, int enabled)
{
- struct at32_sm *sm = &system_manager;
unsigned long flags;
u32 mask;
- spin_lock_irqsave(&sm->lock, flags);
- mask = sm_readl(sm, PM_PBA_MASK);
+ spin_lock_irqsave(&pm_lock, flags);
+ mask = pm_readl(PBA_MASK);
if (enabled)
mask |= 1 << clk->index;
else
mask &= ~(1 << clk->index);
- sm_writel(sm, PM_PBA_MASK, mask);
- spin_unlock_irqrestore(&sm->lock, flags);
+ pm_writel(PBA_MASK, mask);
+ spin_unlock_irqrestore(&pm_lock, flags);
}
static unsigned long pba_clk_get_rate(struct clk *clk)
{
unsigned long cksel, shift = 0;
- cksel = sm_readl(&system_manager, PM_CKSEL);
- if (cksel & SM_BIT(PBADIV))
- shift = SM_BFEXT(PBASEL, cksel) + 1;
+ cksel = pm_readl(CKSEL);
+ if (cksel & PM_BIT(PBADIV))
+ shift = PM_BFEXT(PBASEL, cksel) + 1;
return bus_clk_get_rate(clk, shift);
}
static void pbb_clk_mode(struct clk *clk, int enabled)
{
- struct at32_sm *sm = &system_manager;
unsigned long flags;
u32 mask;
- spin_lock_irqsave(&sm->lock, flags);
- mask = sm_readl(sm, PM_PBB_MASK);
+ spin_lock_irqsave(&pm_lock, flags);
+ mask = pm_readl(PBB_MASK);
if (enabled)
mask |= 1 << clk->index;
else
mask &= ~(1 << clk->index);
- sm_writel(sm, PM_PBB_MASK, mask);
- spin_unlock_irqrestore(&sm->lock, flags);
+ pm_writel(PBB_MASK, mask);
+ spin_unlock_irqrestore(&pm_lock, flags);
}
static unsigned long pbb_clk_get_rate(struct clk *clk)
{
unsigned long cksel, shift = 0;
- cksel = sm_readl(&system_manager, PM_CKSEL);
- if (cksel & SM_BIT(PBBDIV))
- shift = SM_BFEXT(PBBSEL, cksel) + 1;
+ cksel = pm_readl(CKSEL);
+ if (cksel & PM_BIT(PBBDIV))
+ shift = PM_BFEXT(PBBSEL, cksel) + 1;
return bus_clk_get_rate(clk, shift);
}
@@ -296,6 +335,7 @@ static unsigned long pbb_clk_get_rate(struct clk *clk)
static struct clk cpu_clk = {
.name = "cpu",
.get_rate = cpu_clk_get_rate,
+ .set_rate = cpu_clk_set_rate,
.users = 1,
};
static struct clk hsb_clk = {
@@ -327,12 +367,12 @@ static void genclk_mode(struct clk *clk, int enabled)
{
u32 control;
- control = sm_readl(&system_manager, PM_GCCTRL + 4 * clk->index);
+ control = pm_readl(GCCTRL(clk->index));
if (enabled)
- control |= SM_BIT(CEN);
+ control |= PM_BIT(CEN);
else
- control &= ~SM_BIT(CEN);
- sm_writel(&system_manager, PM_GCCTRL + 4 * clk->index, control);
+ control &= ~PM_BIT(CEN);
+ pm_writel(GCCTRL(clk->index), control);
}
static unsigned long genclk_get_rate(struct clk *clk)
@@ -340,9 +380,9 @@ static unsigned long genclk_get_rate(struct clk *clk)
u32 control;
unsigned long div = 1;
- control = sm_readl(&system_manager, PM_GCCTRL + 4 * clk->index);
- if (control & SM_BIT(DIVEN))
- div = 2 * (SM_BFEXT(DIV, control) + 1);
+ control = pm_readl(GCCTRL(clk->index));
+ if (control & PM_BIT(DIVEN))
+ div = 2 * (PM_BFEXT(DIV, control) + 1);
return clk->parent->get_rate(clk->parent) / div;
}
@@ -353,23 +393,22 @@ static long genclk_set_rate(struct clk *clk, unsigned long rate, int apply)
unsigned long parent_rate, actual_rate, div;
parent_rate = clk->parent->get_rate(clk->parent);
- control = sm_readl(&system_manager, PM_GCCTRL + 4 * clk->index);
+ control = pm_readl(GCCTRL(clk->index));
if (rate > 3 * parent_rate / 4) {
actual_rate = parent_rate;
- control &= ~SM_BIT(DIVEN);
+ control &= ~PM_BIT(DIVEN);
} else {
div = (parent_rate + rate) / (2 * rate) - 1;
- control = SM_BFINS(DIV, div, control) | SM_BIT(DIVEN);
+ control = PM_BFINS(DIV, div, control) | PM_BIT(DIVEN);
actual_rate = parent_rate / (2 * (div + 1));
}
- printk("clk %s: new rate %lu (actual rate %lu)\n",
- clk->name, rate, actual_rate);
+ dev_dbg(clk->dev, "clk %s: new rate %lu (actual rate %lu)\n",
+ clk->name, rate, actual_rate);
if (apply)
- sm_writel(&system_manager, PM_GCCTRL + 4 * clk->index,
- control);
+ pm_writel(GCCTRL(clk->index), control);
return actual_rate;
}
@@ -378,24 +417,24 @@ int genclk_set_parent(struct clk *clk, struct clk *parent)
{
u32 control;
- printk("clk %s: new parent %s (was %s)\n",
- clk->name, parent->name, clk->parent->name);
+ dev_dbg(clk->dev, "clk %s: new parent %s (was %s)\n",
+ clk->name, parent->name, clk->parent->name);
- control = sm_readl(&system_manager, PM_GCCTRL + 4 * clk->index);
+ control = pm_readl(GCCTRL(clk->index));
if (parent == &osc1 || parent == &pll1)
- control |= SM_BIT(OSCSEL);
+ control |= PM_BIT(OSCSEL);
else if (parent == &osc0 || parent == &pll0)
- control &= ~SM_BIT(OSCSEL);
+ control &= ~PM_BIT(OSCSEL);
else
return -EINVAL;
if (parent == &pll0 || parent == &pll1)
- control |= SM_BIT(PLLSEL);
+ control |= PM_BIT(PLLSEL);
else
- control &= ~SM_BIT(PLLSEL);
+ control &= ~PM_BIT(PLLSEL);
- sm_writel(&system_manager, PM_GCCTRL + 4 * clk->index, control);
+ pm_writel(GCCTRL(clk->index), control);
clk->parent = parent;
return 0;
@@ -408,11 +447,11 @@ static void __init genclk_init_parent(struct clk *clk)
BUG_ON(clk->index > 7);
- control = sm_readl(&system_manager, PM_GCCTRL + 4 * clk->index);
- if (control & SM_BIT(OSCSEL))
- parent = (control & SM_BIT(PLLSEL)) ? &pll1 : &osc1;
+ control = pm_readl(GCCTRL(clk->index));
+ if (control & PM_BIT(OSCSEL))
+ parent = (control & PM_BIT(PLLSEL)) ? &pll1 : &osc1;
else
- parent = (control & SM_BIT(PLLSEL)) ? &pll0 : &osc0;
+ parent = (control & PM_BIT(PLLSEL)) ? &pll0 : &osc0;
clk->parent = parent;
}
@@ -420,21 +459,53 @@ static void __init genclk_init_parent(struct clk *clk)
/* --------------------------------------------------------------------
* System peripherals
* -------------------------------------------------------------------- */
-static struct resource sm_resource[] = {
- PBMEM(0xfff00000),
- NAMED_IRQ(19, "eim"),
- NAMED_IRQ(20, "pm"),
- NAMED_IRQ(21, "rtc"),
+static struct resource at32_pm0_resource[] = {
+ {
+ .start = 0xfff00000,
+ .end = 0xfff0007f,
+ .flags = IORESOURCE_MEM,
+ },
+ IRQ(20),
};
-struct platform_device at32_sm_device = {
- .name = "sm",
- .id = 0,
- .resource = sm_resource,
- .num_resources = ARRAY_SIZE(sm_resource),
+
+static struct resource at32ap700x_rtc0_resource[] = {
+ {
+ .start = 0xfff00080,
+ .end = 0xfff000af,
+ .flags = IORESOURCE_MEM,
+ },
+ IRQ(21),
+};
+
+static struct resource at32_wdt0_resource[] = {
+ {
+ .start = 0xfff000b0,
+ .end = 0xfff000bf,
+ .flags = IORESOURCE_MEM,
+ },
+};
+
+static struct resource at32_eic0_resource[] = {
+ {
+ .start = 0xfff00100,
+ .end = 0xfff0013f,
+ .flags = IORESOURCE_MEM,
+ },
+ IRQ(19),
};
-static struct clk at32_sm_pclk = {
+
+DEFINE_DEV(at32_pm, 0);
+DEFINE_DEV(at32ap700x_rtc, 0);
+DEFINE_DEV(at32_wdt, 0);
+DEFINE_DEV(at32_eic, 0);
+
+/*
+ * Peripheral clock for PM, RTC, WDT and EIC. PM will ensure that this
+ * is always running.
+ */
+static struct clk at32_pm_pclk = {
.name = "pclk",
- .dev = &at32_sm_device.dev,
+ .dev = &at32_pm0_device.dev,
.parent = &pbb_clk,
.mode = pbb_clk_mode,
.get_rate = pbb_clk_get_rate,
@@ -583,10 +654,11 @@ DEV_CLK(mck, pio4, pba, 14);
void __init at32_add_system_devices(void)
{
- system_manager.eim_first_irq = EIM_IRQ_BASE;
-
- platform_device_register(&at32_sm_device);
+ platform_device_register(&at32_pm0_device);
platform_device_register(&at32_intc0_device);
+ platform_device_register(&at32ap700x_rtc0_device);
+ platform_device_register(&at32_wdt0_device);
+ platform_device_register(&at32_eic0_device);
platform_device_register(&smc0_device);
platform_device_register(&pdc_device);
@@ -1013,6 +1085,89 @@ err_dup_modedb:
}
/* --------------------------------------------------------------------
+ * SSC
+ * -------------------------------------------------------------------- */
+static struct resource ssc0_resource[] = {
+ PBMEM(0xffe01c00),
+ IRQ(10),
+};
+DEFINE_DEV(ssc, 0);
+DEV_CLK(pclk, ssc0, pba, 7);
+
+static struct resource ssc1_resource[] = {
+ PBMEM(0xffe02000),
+ IRQ(11),
+};
+DEFINE_DEV(ssc, 1);
+DEV_CLK(pclk, ssc1, pba, 8);
+
+static struct resource ssc2_resource[] = {
+ PBMEM(0xffe02400),
+ IRQ(12),
+};
+DEFINE_DEV(ssc, 2);
+DEV_CLK(pclk, ssc2, pba, 9);
+
+struct platform_device *__init
+at32_add_device_ssc(unsigned int id, unsigned int flags)
+{
+ struct platform_device *pdev;
+
+ switch (id) {
+ case 0:
+ pdev = &ssc0_device;
+ if (flags & ATMEL_SSC_RF)
+ select_peripheral(PA(21), PERIPH_A, 0); /* RF */
+ if (flags & ATMEL_SSC_RK)
+ select_peripheral(PA(22), PERIPH_A, 0); /* RK */
+ if (flags & ATMEL_SSC_TK)
+ select_peripheral(PA(23), PERIPH_A, 0); /* TK */
+ if (flags & ATMEL_SSC_TF)
+ select_peripheral(PA(24), PERIPH_A, 0); /* TF */
+ if (flags & ATMEL_SSC_TD)
+ select_peripheral(PA(25), PERIPH_A, 0); /* TD */
+ if (flags & ATMEL_SSC_RD)
+ select_peripheral(PA(26), PERIPH_A, 0); /* RD */
+ break;
+ case 1:
+ pdev = &ssc1_device;
+ if (flags & ATMEL_SSC_RF)
+ select_peripheral(PA(0), PERIPH_B, 0); /* RF */
+ if (flags & ATMEL_SSC_RK)
+ select_peripheral(PA(1), PERIPH_B, 0); /* RK */
+ if (flags & ATMEL_SSC_TK)
+ select_peripheral(PA(2), PERIPH_B, 0); /* TK */
+ if (flags & ATMEL_SSC_TF)
+ select_peripheral(PA(3), PERIPH_B, 0); /* TF */
+ if (flags & ATMEL_SSC_TD)
+ select_peripheral(PA(4), PERIPH_B, 0); /* TD */
+ if (flags & ATMEL_SSC_RD)
+ select_peripheral(PA(5), PERIPH_B, 0); /* RD */
+ break;
+ case 2:
+ pdev = &ssc2_device;
+ if (flags & ATMEL_SSC_TD)
+ select_peripheral(PB(13), PERIPH_A, 0); /* TD */
+ if (flags & ATMEL_SSC_RD)
+ select_peripheral(PB(14), PERIPH_A, 0); /* RD */
+ if (flags & ATMEL_SSC_TK)
+ select_peripheral(PB(15), PERIPH_A, 0); /* TK */
+ if (flags & ATMEL_SSC_TF)
+ select_peripheral(PB(16), PERIPH_A, 0); /* TF */
+ if (flags & ATMEL_SSC_RF)
+ select_peripheral(PB(17), PERIPH_A, 0); /* RF */
+ if (flags & ATMEL_SSC_RK)
+ select_peripheral(PB(18), PERIPH_A, 0); /* RK */
+ break;
+ default:
+ return NULL;
+ }
+
+ platform_device_register(pdev);
+ return pdev;
+}
+
+/* --------------------------------------------------------------------
* GCLK
* -------------------------------------------------------------------- */
static struct clk gclk0 = {
@@ -1066,7 +1221,7 @@ struct clk *at32_clock_list[] = {
&hsb_clk,
&pba_clk,
&pbb_clk,
- &at32_sm_pclk,
+ &at32_pm_pclk,
&at32_intc0_pclk,
&hmatrix_clk,
&ebi_clk,
@@ -1094,6 +1249,9 @@ struct clk *at32_clock_list[] = {
&atmel_spi1_spi_clk,
&atmel_lcdfb0_hck1,
&atmel_lcdfb0_pixclk,
+ &ssc0_pclk,
+ &ssc1_pclk,
+ &ssc2_pclk,
&gclk0,
&gclk1,
&gclk2,
@@ -1113,18 +1271,20 @@ void __init at32_portmux_init(void)
void __init at32_clock_init(void)
{
- struct at32_sm *sm = &system_manager;
u32 cpu_mask = 0, hsb_mask = 0, pba_mask = 0, pbb_mask = 0;
int i;
- if (sm_readl(sm, PM_MCCTRL) & SM_BIT(PLLSEL))
+ if (pm_readl(MCCTRL) & PM_BIT(PLLSEL)) {
main_clock = &pll0;
- else
+ cpu_clk.parent = &pll0;
+ } else {
main_clock = &osc0;
+ cpu_clk.parent = &osc0;
+ }
- if (sm_readl(sm, PM_PLL0) & SM_BIT(PLLOSC))
+ if (pm_readl(PLL0) & PM_BIT(PLLOSC))
pll0.parent = &osc1;
- if (sm_readl(sm, PM_PLL1) & SM_BIT(PLLOSC))
+ if (pm_readl(PLL1) & PM_BIT(PLLOSC))
pll1.parent = &osc1;
genclk_init_parent(&gclk0);
@@ -1157,8 +1317,8 @@ void __init at32_clock_init(void)
pbb_mask |= 1 << clk->index;
}
- sm_writel(sm, PM_CPU_MASK, cpu_mask);
- sm_writel(sm, PM_HSB_MASK, hsb_mask);
- sm_writel(sm, PM_PBA_MASK, pba_mask);
- sm_writel(sm, PM_PBB_MASK, pbb_mask);
+ pm_writel(CPU_MASK, cpu_mask);
+ pm_writel(HSB_MASK, hsb_mask);
+ pm_writel(PBA_MASK, pba_mask);
+ pm_writel(PBB_MASK, pbb_mask);
}
diff --git a/arch/avr32/mach-at32ap/cpufreq.c b/arch/avr32/mach-at32ap/cpufreq.c
new file mode 100644
index 00000000000..235524b7919
--- /dev/null
+++ b/arch/avr32/mach-at32ap/cpufreq.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (C) 2004-2007 Atmel Corporation
+ *
+ * Based on MIPS implementation arch/mips/kernel/time.c
+ * Copyright 2001 MontaVista Software Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*#define DEBUG*/
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/cpufreq.h>
+#include <linux/io.h>
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <asm/system.h>
+
+static struct clk *cpuclk;
+
+static int at32_verify_speed(struct cpufreq_policy *policy)
+{
+ if (policy->cpu != 0)
+ return -EINVAL;
+
+ cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
+ policy->cpuinfo.max_freq);
+ return 0;
+}
+
+static unsigned int at32_get_speed(unsigned int cpu)
+{
+ /* No SMP support */
+ if (cpu)
+ return 0;
+ return (unsigned int)((clk_get_rate(cpuclk) + 500) / 1000);
+}
+
+static int at32_set_target(struct cpufreq_policy *policy,
+ unsigned int target_freq,
+ unsigned int relation)
+{
+ struct cpufreq_freqs freqs;
+ long freq;
+
+ /* Convert target_freq from kHz to Hz */
+ freq = clk_round_rate(cpuclk, target_freq * 1000);
+
+ /* Check if policy->min <= new_freq <= policy->max */
+ if (freq < (policy->min * 1000) || freq > (policy->max * 1000))
+ return -EINVAL;
+
+ pr_debug("cpufreq: requested frequency %u Hz\n", target_freq * 1000);
+
+ freqs.old = at32_get_speed(0);
+ freqs.new = (freq + 500) / 1000;
+ freqs.cpu = 0;
+ freqs.flags = 0;
+
+ cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
+ clk_set_rate(cpuclk, freq);
+ cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
+
+ pr_debug("cpufreq: set frequency %lu Hz\n", freq);
+
+ return 0;
+}
+
+static int __init at32_cpufreq_driver_init(struct cpufreq_policy *policy)
+{
+ if (policy->cpu != 0)
+ return -EINVAL;
+
+ cpuclk = clk_get(NULL, "cpu");
+ if (IS_ERR(cpuclk)) {
+ pr_debug("cpufreq: could not get CPU clk\n");
+ return PTR_ERR(cpuclk);
+ }
+
+ policy->cpuinfo.min_freq = (clk_round_rate(cpuclk, 1) + 500) / 1000;
+ policy->cpuinfo.max_freq = (clk_round_rate(cpuclk, ~0UL) + 500) / 1000;
+ policy->cpuinfo.transition_latency = 0;
+ policy->cur = at32_get_speed(0);
+ policy->min = policy->cpuinfo.min_freq;
+ policy->max = policy->cpuinfo.max_freq;
+ policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
+
+ printk("cpufreq: AT32AP CPU frequency driver\n");
+
+ return 0;
+}
+
+static struct cpufreq_driver at32_driver = {
+ .name = "at32ap",
+ .owner = THIS_MODULE,
+ .init = at32_cpufreq_driver_init,
+ .verify = at32_verify_speed,
+ .target = at32_set_target,
+ .get = at32_get_speed,
+ .flags = CPUFREQ_STICKY,
+};
+
+static int __init at32_cpufreq_init(void)
+{
+ return cpufreq_register_driver(&at32_driver);
+}
+
+arch_initcall(at32_cpufreq_init);
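The driver converts between cpufreq's kHz units and the clk framework's Hz units in several places above; the sketch below (hypothetical helper, assuming <linux/clk.h>) shows the round-trip used by at32_set_target(): scale the requested kHz to Hz, let clk_round_rate() pick what the hardware can do, then report it back in kHz with the same (rate + 500) / 1000 rounding as at32_get_speed().

/* Illustrative sketch only: mirrors the rounding used by the driver above. */
static unsigned int khz_after_rounding(struct clk *cpuclk, unsigned int target_khz)
{
        long freq_hz = clk_round_rate(cpuclk, target_khz * 1000);

        return (unsigned int)((freq_hz + 500) / 1000);
}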
diff --git a/arch/avr32/mach-at32ap/extint.c b/arch/avr32/mach-at32ap/extint.c
index 4a60eccfebd..8acd0109003 100644
--- a/arch/avr32/mach-at32ap/extint.c
+++ b/arch/avr32/mach-at32ap/extint.c
@@ -17,42 +17,83 @@
#include <asm/io.h>
-#include <asm/arch/sm.h>
-
-#include "sm.h"
+/* EIC register offsets */
+#define EIC_IER 0x0000
+#define EIC_IDR 0x0004
+#define EIC_IMR 0x0008
+#define EIC_ISR 0x000c
+#define EIC_ICR 0x0010
+#define EIC_MODE 0x0014
+#define EIC_EDGE 0x0018
+#define EIC_LEVEL 0x001c
+#define EIC_TEST 0x0020
+#define EIC_NMIC 0x0024
+
+/* Bitfields in TEST */
+#define EIC_TESTEN_OFFSET 31
+#define EIC_TESTEN_SIZE 1
+
+/* Bitfields in NMIC */
+#define EIC_EN_OFFSET 0
+#define EIC_EN_SIZE 1
+
+/* Bit manipulation macros */
+#define EIC_BIT(name) \
+ (1 << EIC_##name##_OFFSET)
+#define EIC_BF(name,value) \
+ (((value) & ((1 << EIC_##name##_SIZE) - 1)) \
+ << EIC_##name##_OFFSET)
+#define EIC_BFEXT(name,value) \
+ (((value) >> EIC_##name##_OFFSET) \
+ & ((1 << EIC_##name##_SIZE) - 1))
+#define EIC_BFINS(name,value,old) \
+ (((old) & ~(((1 << EIC_##name##_SIZE) - 1) \
+ << EIC_##name##_OFFSET)) \
+ | EIC_BF(name,value))
+
+/* Register access macros */
+#define eic_readl(port,reg) \
+ __raw_readl((port)->regs + EIC_##reg)
+#define eic_writel(port,reg,value) \
+ __raw_writel((value), (port)->regs + EIC_##reg)
+
+struct eic {
+ void __iomem *regs;
+ struct irq_chip *chip;
+ unsigned int first_irq;
+};
-static void eim_ack_irq(unsigned int irq)
+static void eic_ack_irq(unsigned int irq)
{
- struct at32_sm *sm = get_irq_chip_data(irq);
- sm_writel(sm, EIM_ICR, 1 << (irq - sm->eim_first_irq));
+ struct eic *eic = get_irq_chip_data(irq);
+ eic_writel(eic, ICR, 1 << (irq - eic->first_irq));
}
-static void eim_mask_irq(unsigned int irq)
+static void eic_mask_irq(unsigned int irq)
{
- struct at32_sm *sm = get_irq_chip_data(irq);
- sm_writel(sm, EIM_IDR, 1 << (irq - sm->eim_first_irq));
+ struct eic *eic = get_irq_chip_data(irq);
+ eic_writel(eic, IDR, 1 << (irq - eic->first_irq));
}
-static void eim_mask_ack_irq(unsigned int irq)
+static void eic_mask_ack_irq(unsigned int irq)
{
- struct at32_sm *sm = get_irq_chip_data(irq);
- sm_writel(sm, EIM_ICR, 1 << (irq - sm->eim_first_irq));
- sm_writel(sm, EIM_IDR, 1 << (irq - sm->eim_first_irq));
+ struct eic *eic = get_irq_chip_data(irq);
+ eic_writel(eic, ICR, 1 << (irq - eic->first_irq));
+ eic_writel(eic, IDR, 1 << (irq - eic->first_irq));
}
-static void eim_unmask_irq(unsigned int irq)
+static void eic_unmask_irq(unsigned int irq)
{
- struct at32_sm *sm = get_irq_chip_data(irq);
- sm_writel(sm, EIM_IER, 1 << (irq - sm->eim_first_irq));
+ struct eic *eic = get_irq_chip_data(irq);
+ eic_writel(eic, IER, 1 << (irq - eic->first_irq));
}
-static int eim_set_irq_type(unsigned int irq, unsigned int flow_type)
+static int eic_set_irq_type(unsigned int irq, unsigned int flow_type)
{
- struct at32_sm *sm = get_irq_chip_data(irq);
+ struct eic *eic = get_irq_chip_data(irq);
struct irq_desc *desc;
- unsigned int i = irq - sm->eim_first_irq;
+ unsigned int i = irq - eic->first_irq;
u32 mode, edge, level;
- unsigned long flags;
int ret = 0;
flow_type &= IRQ_TYPE_SENSE_MASK;
@@ -60,11 +101,10 @@ static int eim_set_irq_type(unsigned int irq, unsigned int flow_type)
flow_type = IRQ_TYPE_LEVEL_LOW;
desc = &irq_desc[irq];
- spin_lock_irqsave(&sm->lock, flags);
- mode = sm_readl(sm, EIM_MODE);
- edge = sm_readl(sm, EIM_EDGE);
- level = sm_readl(sm, EIM_LEVEL);
+ mode = eic_readl(eic, MODE);
+ edge = eic_readl(eic, EDGE);
+ level = eic_readl(eic, LEVEL);
switch (flow_type) {
case IRQ_TYPE_LEVEL_LOW:
@@ -89,9 +129,9 @@ static int eim_set_irq_type(unsigned int irq, unsigned int flow_type)
}
if (ret == 0) {
- sm_writel(sm, EIM_MODE, mode);
- sm_writel(sm, EIM_EDGE, edge);
- sm_writel(sm, EIM_LEVEL, level);
+ eic_writel(eic, MODE, mode);
+ eic_writel(eic, EDGE, edge);
+ eic_writel(eic, LEVEL, level);
if (flow_type & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
flow_type |= IRQ_LEVEL;
@@ -99,35 +139,33 @@ static int eim_set_irq_type(unsigned int irq, unsigned int flow_type)
desc->status |= flow_type;
}
- spin_unlock_irqrestore(&sm->lock, flags);
-
return ret;
}
-struct irq_chip eim_chip = {
- .name = "eim",
- .ack = eim_ack_irq,
- .mask = eim_mask_irq,
- .mask_ack = eim_mask_ack_irq,
- .unmask = eim_unmask_irq,
- .set_type = eim_set_irq_type,
+struct irq_chip eic_chip = {
+ .name = "eic",
+ .ack = eic_ack_irq,
+ .mask = eic_mask_irq,
+ .mask_ack = eic_mask_ack_irq,
+ .unmask = eic_unmask_irq,
+ .set_type = eic_set_irq_type,
};
-static void demux_eim_irq(unsigned int irq, struct irq_desc *desc)
+static void demux_eic_irq(unsigned int irq, struct irq_desc *desc)
{
- struct at32_sm *sm = desc->handler_data;
+ struct eic *eic = desc->handler_data;
struct irq_desc *ext_desc;
unsigned long status, pending;
unsigned int i, ext_irq;
- status = sm_readl(sm, EIM_ISR);
- pending = status & sm_readl(sm, EIM_IMR);
+ status = eic_readl(eic, ISR);
+ pending = status & eic_readl(eic, IMR);
while (pending) {
i = fls(pending) - 1;
pending &= ~(1 << i);
- ext_irq = i + sm->eim_first_irq;
+ ext_irq = i + eic->first_irq;
ext_desc = irq_desc + ext_irq;
if (ext_desc->status & IRQ_LEVEL)
handle_level_irq(ext_irq, ext_desc);
@@ -136,51 +174,85 @@ static void demux_eim_irq(unsigned int irq, struct irq_desc *desc)
}
}
-static int __init eim_init(void)
+static int __init eic_probe(struct platform_device *pdev)
{
- struct at32_sm *sm = &system_manager;
+ struct eic *eic;
+ struct resource *regs;
unsigned int i;
unsigned int nr_irqs;
unsigned int int_irq;
+ int ret;
u32 pattern;
- /*
- * The EIM is really the same module as SM, so register
- * mapping, etc. has been taken care of already.
- */
+ regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ int_irq = platform_get_irq(pdev, 0);
+ if (!regs || !int_irq) {
+ dev_dbg(&pdev->dev, "missing regs and/or irq resource\n");
+ return -ENXIO;
+ }
+
+ ret = -ENOMEM;
+ eic = kzalloc(sizeof(struct eic), GFP_KERNEL);
+ if (!eic) {
+ dev_dbg(&pdev->dev, "no memory for eic structure\n");
+ goto err_kzalloc;
+ }
+
+ eic->first_irq = EIM_IRQ_BASE + 32 * pdev->id;
+ eic->regs = ioremap(regs->start, regs->end - regs->start + 1);
+ if (!eic->regs) {
+ dev_dbg(&pdev->dev, "failed to map regs\n");
+ goto err_ioremap;
+ }
/*
* Find out how many interrupt lines are actually
* implemented in hardware.
*/
- sm_writel(sm, EIM_IDR, ~0UL);
- sm_writel(sm, EIM_MODE, ~0UL);
- pattern = sm_readl(sm, EIM_MODE);
+ eic_writel(eic, IDR, ~0UL);
+ eic_writel(eic, MODE, ~0UL);
+ pattern = eic_readl(eic, MODE);
nr_irqs = fls(pattern);
/* Trigger on falling edge unless overridden by driver */
- sm_writel(sm, EIM_MODE, 0UL);
- sm_writel(sm, EIM_EDGE, 0UL);
+ eic_writel(eic, MODE, 0UL);
+ eic_writel(eic, EDGE, 0UL);
- sm->eim_chip = &eim_chip;
+ eic->chip = &eic_chip;
for (i = 0; i < nr_irqs; i++) {
/* NOTE the handler we set here is ignored by the demux */
- set_irq_chip_and_handler(sm->eim_first_irq + i, &eim_chip,
+ set_irq_chip_and_handler(eic->first_irq + i, &eic_chip,
handle_level_irq);
- set_irq_chip_data(sm->eim_first_irq + i, sm);
+ set_irq_chip_data(eic->first_irq + i, eic);
}
- int_irq = platform_get_irq_byname(sm->pdev, "eim");
-
- set_irq_chained_handler(int_irq, demux_eim_irq);
- set_irq_data(int_irq, sm);
+ set_irq_chained_handler(int_irq, demux_eic_irq);
+ set_irq_data(int_irq, eic);
- printk("EIM: External Interrupt Module at 0x%p, IRQ %u\n",
- sm->regs, int_irq);
- printk("EIM: Handling %u external IRQs, starting with IRQ %u\n",
- nr_irqs, sm->eim_first_irq);
+ dev_info(&pdev->dev,
+ "External Interrupt Controller at 0x%p, IRQ %u\n",
+ eic->regs, int_irq);
+ dev_info(&pdev->dev,
+ "Handling %u external IRQs, starting with IRQ %u\n",
+ nr_irqs, eic->first_irq);
return 0;
+
+err_ioremap:
+ kfree(eic);
+err_kzalloc:
+ return ret;
+}
+
+static struct platform_driver eic_driver = {
+ .driver = {
+ .name = "at32_eic",
+ },
+};
+
+static int __init eic_init(void)
+{
+ return platform_driver_probe(&eic_driver, eic_probe);
}
-arch_initcall(eim_init);
+arch_initcall(eic_init);
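The line-count detection in eic_probe() relies on unimplemented MODE bits reading back as zero; a minimal sketch of that trick (hypothetical helper, assuming an already-mapped register block and <linux/bitops.h> for fls()):

/* Illustrative sketch: write all ones, read back, count implemented bits. */
static unsigned int eic_count_lines(void __iomem *regs)
{
        __raw_writel(~0UL, regs + EIC_MODE);      /* try to set every mode bit */
        return fls(__raw_readl(regs + EIC_MODE)); /* highest implemented bit + 1 */
}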
diff --git a/arch/avr32/mach-at32ap/pm.h b/arch/avr32/mach-at32ap/pm.h
new file mode 100644
index 00000000000..a1f8aced0a8
--- /dev/null
+++ b/arch/avr32/mach-at32ap/pm.h
@@ -0,0 +1,112 @@
+/*
+ * Register definitions for the Power Manager (PM)
+ */
+#ifndef __ARCH_AVR32_MACH_AT32AP_PM_H__
+#define __ARCH_AVR32_MACH_AT32AP_PM_H__
+
+/* PM register offsets */
+#define PM_MCCTRL 0x0000
+#define PM_CKSEL 0x0004
+#define PM_CPU_MASK 0x0008
+#define PM_HSB_MASK 0x000c
+#define PM_PBA_MASK 0x0010
+#define PM_PBB_MASK 0x0014
+#define PM_PLL0 0x0020
+#define PM_PLL1 0x0024
+#define PM_IER 0x0040
+#define PM_IDR 0x0044
+#define PM_IMR 0x0048
+#define PM_ISR 0x004c
+#define PM_ICR 0x0050
+#define PM_GCCTRL(x) (0x0060 + 4 * (x))
+#define PM_RCAUSE 0x00c0
+
+/* Bitfields in CKSEL */
+#define PM_CPUSEL_OFFSET 0
+#define PM_CPUSEL_SIZE 3
+#define PM_CPUDIV_OFFSET 7
+#define PM_CPUDIV_SIZE 1
+#define PM_HSBSEL_OFFSET 8
+#define PM_HSBSEL_SIZE 3
+#define PM_HSBDIV_OFFSET 15
+#define PM_HSBDIV_SIZE 1
+#define PM_PBASEL_OFFSET 16
+#define PM_PBASEL_SIZE 3
+#define PM_PBADIV_OFFSET 23
+#define PM_PBADIV_SIZE 1
+#define PM_PBBSEL_OFFSET 24
+#define PM_PBBSEL_SIZE 3
+#define PM_PBBDIV_OFFSET 31
+#define PM_PBBDIV_SIZE 1
+
+/* Bitfields in PLL0 */
+#define PM_PLLEN_OFFSET 0
+#define PM_PLLEN_SIZE 1
+#define PM_PLLOSC_OFFSET 1
+#define PM_PLLOSC_SIZE 1
+#define PM_PLLOPT_OFFSET 2
+#define PM_PLLOPT_SIZE 3
+#define PM_PLLDIV_OFFSET 8
+#define PM_PLLDIV_SIZE 8
+#define PM_PLLMUL_OFFSET 16
+#define PM_PLLMUL_SIZE 8
+#define PM_PLLCOUNT_OFFSET 24
+#define PM_PLLCOUNT_SIZE 6
+#define PM_PLLTEST_OFFSET 31
+#define PM_PLLTEST_SIZE 1
+
+/* Bitfields in ICR */
+#define PM_LOCK0_OFFSET 0
+#define PM_LOCK0_SIZE 1
+#define PM_LOCK1_OFFSET 1
+#define PM_LOCK1_SIZE 1
+#define PM_WAKE_OFFSET 2
+#define PM_WAKE_SIZE 1
+#define PM_CKRDY_OFFSET 5
+#define PM_CKRDY_SIZE 1
+#define PM_MSKRDY_OFFSET 6
+#define PM_MSKRDY_SIZE 1
+
+/* Bitfields in GCCTRL0 */
+#define PM_OSCSEL_OFFSET 0
+#define PM_OSCSEL_SIZE 1
+#define PM_PLLSEL_OFFSET 1
+#define PM_PLLSEL_SIZE 1
+#define PM_CEN_OFFSET 2
+#define PM_CEN_SIZE 1
+#define PM_DIVEN_OFFSET 4
+#define PM_DIVEN_SIZE 1
+#define PM_DIV_OFFSET 8
+#define PM_DIV_SIZE 8
+
+/* Bitfields in RCAUSE */
+#define PM_POR_OFFSET 0
+#define PM_POR_SIZE 1
+#define PM_EXT_OFFSET 2
+#define PM_EXT_SIZE 1
+#define PM_WDT_OFFSET 3
+#define PM_WDT_SIZE 1
+#define PM_NTAE_OFFSET 4
+#define PM_NTAE_SIZE 1
+
+/* Bit manipulation macros */
+#define PM_BIT(name) \
+ (1 << PM_##name##_OFFSET)
+#define PM_BF(name,value) \
+ (((value) & ((1 << PM_##name##_SIZE) - 1)) \
+ << PM_##name##_OFFSET)
+#define PM_BFEXT(name,value) \
+ (((value) >> PM_##name##_OFFSET) \
+ & ((1 << PM_##name##_SIZE) - 1))
+#define PM_BFINS(name,value,old)\
+ (((old) & ~(((1 << PM_##name##_SIZE) - 1) \
+ << PM_##name##_OFFSET)) \
+ | PM_BF(name,value))
+
+/* Register access macros */
+#define pm_readl(reg) \
+ __raw_readl((void __iomem *)AT32_PM_BASE + PM_##reg)
+#define pm_writel(reg,value) \
+ __raw_writel((value), (void __iomem *)AT32_PM_BASE + PM_##reg)
+
+#endif /* __ARCH_AVR32_MACH_AT32AP_PM_H__ */
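A usage sketch for the bit-manipulation and register-access macros above, mirroring the CKSEL handling in cpu_clk_set_rate(); the helper name and divider value are illustrative:

/* Illustrative: select a divide-by-4 CPU clock (CPUSEL = 1, CPUDIV enabled). */
static void pm_set_cpu_div4(void)
{
        u32 cksel = pm_readl(CKSEL);

        cksel = PM_BFINS(CPUSEL, 1, cksel); /* rate = parent / 2^(CPUSEL + 1) */
        cksel |= PM_BIT(CPUDIV);
        pm_writel(CKSEL, cksel);
}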
diff --git a/arch/avr32/mach-at32ap/sm.h b/arch/avr32/mach-at32ap/sm.h
deleted file mode 100644
index cad02b512bc..00000000000
--- a/arch/avr32/mach-at32ap/sm.h
+++ /dev/null
@@ -1,242 +0,0 @@
-/*
- * Register definitions for SM
- *
- * System Manager
- */
-#ifndef __ASM_AVR32_SM_H__
-#define __ASM_AVR32_SM_H__
-
-/* SM register offsets */
-#define SM_PM_MCCTRL 0x0000
-#define SM_PM_CKSEL 0x0004
-#define SM_PM_CPU_MASK 0x0008
-#define SM_PM_HSB_MASK 0x000c
-#define SM_PM_PBA_MASK 0x0010
-#define SM_PM_PBB_MASK 0x0014
-#define SM_PM_PLL0 0x0020
-#define SM_PM_PLL1 0x0024
-#define SM_PM_VCTRL 0x0030
-#define SM_PM_VMREF 0x0034
-#define SM_PM_VMV 0x0038
-#define SM_PM_IER 0x0040
-#define SM_PM_IDR 0x0044
-#define SM_PM_IMR 0x0048
-#define SM_PM_ISR 0x004c
-#define SM_PM_ICR 0x0050
-#define SM_PM_GCCTRL 0x0060
-#define SM_RTC_CTRL 0x0080
-#define SM_RTC_VAL 0x0084
-#define SM_RTC_TOP 0x0088
-#define SM_RTC_IER 0x0090
-#define SM_RTC_IDR 0x0094
-#define SM_RTC_IMR 0x0098
-#define SM_RTC_ISR 0x009c
-#define SM_RTC_ICR 0x00a0
-#define SM_WDT_CTRL 0x00b0
-#define SM_WDT_CLR 0x00b4
-#define SM_WDT_EXT 0x00b8
-#define SM_RC_RCAUSE 0x00c0
-#define SM_EIM_IER 0x0100
-#define SM_EIM_IDR 0x0104
-#define SM_EIM_IMR 0x0108
-#define SM_EIM_ISR 0x010c
-#define SM_EIM_ICR 0x0110
-#define SM_EIM_MODE 0x0114
-#define SM_EIM_EDGE 0x0118
-#define SM_EIM_LEVEL 0x011c
-#define SM_EIM_TEST 0x0120
-#define SM_EIM_NMIC 0x0124
-
-/* Bitfields in PM_MCCTRL */
-
-/* Bitfields in PM_CKSEL */
-#define SM_CPUSEL_OFFSET 0
-#define SM_CPUSEL_SIZE 3
-#define SM_CPUDIV_OFFSET 7
-#define SM_CPUDIV_SIZE 1
-#define SM_HSBSEL_OFFSET 8
-#define SM_HSBSEL_SIZE 3
-#define SM_HSBDIV_OFFSET 15
-#define SM_HSBDIV_SIZE 1
-#define SM_PBASEL_OFFSET 16
-#define SM_PBASEL_SIZE 3
-#define SM_PBADIV_OFFSET 23
-#define SM_PBADIV_SIZE 1
-#define SM_PBBSEL_OFFSET 24
-#define SM_PBBSEL_SIZE 3
-#define SM_PBBDIV_OFFSET 31
-#define SM_PBBDIV_SIZE 1
-
-/* Bitfields in PM_CPU_MASK */
-
-/* Bitfields in PM_HSB_MASK */
-
-/* Bitfields in PM_PBA_MASK */
-
-/* Bitfields in PM_PBB_MASK */
-
-/* Bitfields in PM_PLL0 */
-#define SM_PLLEN_OFFSET 0
-#define SM_PLLEN_SIZE 1
-#define SM_PLLOSC_OFFSET 1
-#define SM_PLLOSC_SIZE 1
-#define SM_PLLOPT_OFFSET 2
-#define SM_PLLOPT_SIZE 3
-#define SM_PLLDIV_OFFSET 8
-#define SM_PLLDIV_SIZE 8
-#define SM_PLLMUL_OFFSET 16
-#define SM_PLLMUL_SIZE 8
-#define SM_PLLCOUNT_OFFSET 24
-#define SM_PLLCOUNT_SIZE 6
-#define SM_PLLTEST_OFFSET 31
-#define SM_PLLTEST_SIZE 1
-
-/* Bitfields in PM_PLL1 */
-
-/* Bitfields in PM_VCTRL */
-#define SM_VAUTO_OFFSET 0
-#define SM_VAUTO_SIZE 1
-#define SM_PM_VCTRL_VAL_OFFSET 8
-#define SM_PM_VCTRL_VAL_SIZE 7
-
-/* Bitfields in PM_VMREF */
-#define SM_REFSEL_OFFSET 0
-#define SM_REFSEL_SIZE 4
-
-/* Bitfields in PM_VMV */
-#define SM_PM_VMV_VAL_OFFSET 0
-#define SM_PM_VMV_VAL_SIZE 8
-
-/* Bitfields in PM_IER */
-
-/* Bitfields in PM_IDR */
-
-/* Bitfields in PM_IMR */
-
-/* Bitfields in PM_ISR */
-
-/* Bitfields in PM_ICR */
-#define SM_LOCK0_OFFSET 0
-#define SM_LOCK0_SIZE 1
-#define SM_LOCK1_OFFSET 1
-#define SM_LOCK1_SIZE 1
-#define SM_WAKE_OFFSET 2
-#define SM_WAKE_SIZE 1
-#define SM_VOK_OFFSET 3
-#define SM_VOK_SIZE 1
-#define SM_VMRDY_OFFSET 4
-#define SM_VMRDY_SIZE 1
-#define SM_CKRDY_OFFSET 5
-#define SM_CKRDY_SIZE 1
-
-/* Bitfields in PM_GCCTRL */
-#define SM_OSCSEL_OFFSET 0
-#define SM_OSCSEL_SIZE 1
-#define SM_PLLSEL_OFFSET 1
-#define SM_PLLSEL_SIZE 1
-#define SM_CEN_OFFSET 2
-#define SM_CEN_SIZE 1
-#define SM_CPC_OFFSET 3
-#define SM_CPC_SIZE 1
-#define SM_DIVEN_OFFSET 4
-#define SM_DIVEN_SIZE 1
-#define SM_DIV_OFFSET 8
-#define SM_DIV_SIZE 8
-
-/* Bitfields in RTC_CTRL */
-#define SM_PCLR_OFFSET 1
-#define SM_PCLR_SIZE 1
-#define SM_TOPEN_OFFSET 2
-#define SM_TOPEN_SIZE 1
-#define SM_CLKEN_OFFSET 3
-#define SM_CLKEN_SIZE 1
-#define SM_PSEL_OFFSET 8
-#define SM_PSEL_SIZE 16
-
-/* Bitfields in RTC_VAL */
-#define SM_RTC_VAL_VAL_OFFSET 0
-#define SM_RTC_VAL_VAL_SIZE 31
-
-/* Bitfields in RTC_TOP */
-#define SM_RTC_TOP_VAL_OFFSET 0
-#define SM_RTC_TOP_VAL_SIZE 32
-
-/* Bitfields in RTC_IER */
-
-/* Bitfields in RTC_IDR */
-
-/* Bitfields in RTC_IMR */
-
-/* Bitfields in RTC_ISR */
-
-/* Bitfields in RTC_ICR */
-#define SM_TOPI_OFFSET 0
-#define SM_TOPI_SIZE 1
-
-/* Bitfields in WDT_CTRL */
-#define SM_KEY_OFFSET 24
-#define SM_KEY_SIZE 8
-
-/* Bitfields in WDT_CLR */
-
-/* Bitfields in WDT_EXT */
-
-/* Bitfields in RC_RCAUSE */
-#define SM_POR_OFFSET 0
-#define SM_POR_SIZE 1
-#define SM_BOD_OFFSET 1
-#define SM_BOD_SIZE 1
-#define SM_EXT_OFFSET 2
-#define SM_EXT_SIZE 1
-#define SM_WDT_OFFSET 3
-#define SM_WDT_SIZE 1
-#define SM_NTAE_OFFSET 4
-#define SM_NTAE_SIZE 1
-#define SM_SERP_OFFSET 5
-#define SM_SERP_SIZE 1
-
-/* Bitfields in EIM_IER */
-
-/* Bitfields in EIM_IDR */
-
-/* Bitfields in EIM_IMR */
-
-/* Bitfields in EIM_ISR */
-
-/* Bitfields in EIM_ICR */
-
-/* Bitfields in EIM_MODE */
-
-/* Bitfields in EIM_EDGE */
-#define SM_INT0_OFFSET 0
-#define SM_INT0_SIZE 1
-#define SM_INT1_OFFSET 1
-#define SM_INT1_SIZE 1
-#define SM_INT2_OFFSET 2
-#define SM_INT2_SIZE 1
-#define SM_INT3_OFFSET 3
-#define SM_INT3_SIZE 1
-
-/* Bitfields in EIM_LEVEL */
-
-/* Bitfields in EIM_TEST */
-#define SM_TESTEN_OFFSET 31
-#define SM_TESTEN_SIZE 1
-
-/* Bitfields in EIM_NMIC */
-#define SM_EN_OFFSET 0
-#define SM_EN_SIZE 1
-
-/* Bit manipulation macros */
-#define SM_BIT(name) (1 << SM_##name##_OFFSET)
-#define SM_BF(name,value) (((value) & ((1 << SM_##name##_SIZE) - 1)) << SM_##name##_OFFSET)
-#define SM_BFEXT(name,value) (((value) >> SM_##name##_OFFSET) & ((1 << SM_##name##_SIZE) - 1))
-#define SM_BFINS(name,value,old) (((old) & ~(((1 << SM_##name##_SIZE) - 1) << SM_##name##_OFFSET)) | SM_BF(name,value))
-
-/* Register access macros */
-#define sm_readl(port,reg) \
- __raw_readl((port)->regs + SM_##reg)
-#define sm_writel(port,reg,value) \
- __raw_writel((value), (port)->regs + SM_##reg)
-
-#endif /* __ASM_AVR32_SM_H__ */
diff --git a/arch/frv/kernel/setup.c b/arch/frv/kernel/setup.c
index c1c32e4c863..a74c08786b2 100644
--- a/arch/frv/kernel/setup.c
+++ b/arch/frv/kernel/setup.c
@@ -29,6 +29,7 @@
#include <linux/serial.h>
#include <linux/serial_core.h>
#include <linux/serial_reg.h>
+#include <linux/serial_8250.h>
#include <asm/setup.h>
#include <asm/irq.h>
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index c7c9c2a15fa..7a11b905ef4 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -222,6 +222,8 @@ config PARAVIRT
However, when run without a hypervisor the kernel is
theoretically slower. If in doubt, say N.
+source "arch/i386/xen/Kconfig"
+
config VMI
bool "VMI Paravirt-ops support"
depends on PARAVIRT
diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index 181cc29a7c4..01f0ff0daaf 100644
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -93,6 +93,9 @@ mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-i386/mach-es7000
mcore-$(CONFIG_X86_ES7000) := mach-default
core-$(CONFIG_X86_ES7000) := arch/i386/mach-es7000/
+# Xen paravirtualization support
+core-$(CONFIG_XEN) += arch/i386/xen/
+
# default subarch .h files
mflags-y += -Iinclude/asm-i386/mach-default
diff --git a/arch/i386/boot/Makefile b/arch/i386/boot/Makefile
index 08678a0a3d1..93386a4e40b 100644
--- a/arch/i386/boot/Makefile
+++ b/arch/i386/boot/Makefile
@@ -39,7 +39,7 @@ setup-y += printf.o string.o tty.o video.o version.o voyager.o
setup-y += video-vga.o
setup-y += video-vesa.o
setup-y += video-bios.o
-
+targets += $(setup-y)
hostprogs-y := tools/build
HOSTCFLAGS_build.o := $(LINUXINCLUDE)
diff --git a/arch/i386/boot/boot.h b/arch/i386/boot/boot.h
index 0329c4fe4f8..dec70c9b605 100644
--- a/arch/i386/boot/boot.h
+++ b/arch/i386/boot/boot.h
@@ -56,7 +56,7 @@ static inline u16 inw(u16 port)
static inline void outl(u32 v, u16 port)
{
- asm volatile("outl %0,%1" : : "a" (v), "dn" (port));
+ asm volatile("outl %0,%1" : : "a" (v), "dN" (port));
}
static inline u32 inl(u32 port)
{
diff --git a/arch/i386/boot/compressed/relocs.c b/arch/i386/boot/compressed/relocs.c
index ce4fda261aa..b0e21c3cee5 100644
--- a/arch/i386/boot/compressed/relocs.c
+++ b/arch/i386/boot/compressed/relocs.c
@@ -31,6 +31,8 @@ static const char* safe_abs_relocs[] = {
"__kernel_rt_sigreturn",
"__kernel_sigreturn",
"SYSENTER_RETURN",
+ "xen_irq_disable_direct_reloc",
+ "xen_save_fl_direct_reloc",
};
static int is_safe_abs_reloc(const char* sym_name)
diff --git a/arch/i386/boot/cpucheck.c b/arch/i386/boot/cpucheck.c
index 8b0f4473b08..991e8ceae1d 100644
--- a/arch/i386/boot/cpucheck.c
+++ b/arch/i386/boot/cpucheck.c
@@ -115,8 +115,8 @@ static int has_eflag(u32 mask)
"pushfl ; "
"popl %1 ; "
"popfl"
- : "=r" (f0), "=r" (f1)
- : "g" (mask));
+ : "=&r" (f0), "=&r" (f1)
+ : "ri" (mask));
return !!((f0^f1) & mask);
}
diff --git a/arch/i386/boot/mca.c b/arch/i386/boot/mca.c
index 9b68bd1aef1..68222f2d4b6 100644
--- a/arch/i386/boot/mca.c
+++ b/arch/i386/boot/mca.c
@@ -26,7 +26,7 @@ int query_mca(void)
"setc %0 ; "
"movw %%es, %1 ; "
"popw %%es"
- : "=acdSDm" (err), "=acdSDm" (es), "=b" (bx)
+ : "=acd" (err), "=acdSD" (es), "=b" (bx)
: "a" (0xc000));
if (err)
diff --git a/arch/i386/boot/pm.c b/arch/i386/boot/pm.c
index 3fa53e15ed7..1df025c7326 100644
--- a/arch/i386/boot/pm.c
+++ b/arch/i386/boot/pm.c
@@ -65,7 +65,7 @@ static void move_kernel_around(void)
"popw %%ds ; "
"popw %%es"
: "+c" (dwords)
- : "rm" (dst_seg), "rm" (src_seg)
+ : "r" (dst_seg), "r" (src_seg)
: "esi", "edi");
syssize -= paras;
diff --git a/arch/i386/boot/tools/build.c b/arch/i386/boot/tools/build.c
index 886f47d8a48..b4248740ff0 100644
--- a/arch/i386/boot/tools/build.c
+++ b/arch/i386/boot/tools/build.c
@@ -5,7 +5,7 @@
*/
/*
- * This file builds a disk-image from three different files:
+ * This file builds a disk-image from two different files:
*
* - setup: 8086 machine code, sets up system parm
* - system: 80386 code for actual system
diff --git a/arch/i386/boot/tty.c b/arch/i386/boot/tty.c
index a8db78736b0..9c668aad351 100644
--- a/arch/i386/boot/tty.c
+++ b/arch/i386/boot/tty.c
@@ -31,7 +31,7 @@ void __attribute__((section(".inittext"))) putchar(int ch)
/* int $0x10 is known to have bugs involving touching registers
it shouldn't. Be extra conservative... */
- asm volatile("pushal; int $0x10; popal"
+ asm volatile("pushal; pushw %%ds; int $0x10; popw %%ds; popal"
: : "b" (0x0007), "c" (0x0001), "a" (0x0e00|ch));
}
diff --git a/arch/i386/boot/video.c b/arch/i386/boot/video.c
index 3bb3573cd6a..958130ef004 100644
--- a/arch/i386/boot/video.c
+++ b/arch/i386/boot/video.c
@@ -195,7 +195,7 @@ static void vga_recalc_vertical(void)
{
unsigned int font_size, rows;
u16 crtc;
- u8 ov;
+ u8 pt, ov;
set_fs(0);
font_size = rdfs8(0x485); /* BIOS: font size (pixels) */
@@ -206,7 +206,12 @@ static void vga_recalc_vertical(void)
crtc = vga_crtc();
+ pt = in_idx(crtc, 0x11);
+ pt &= ~0x80; /* Unlock CR0-7 */
+ out_idx(pt, crtc, 0x11);
+
out_idx((u8)rows, crtc, 0x12); /* Lower height register */
+
ov = in_idx(crtc, 0x07); /* Overflow register */
ov &= 0xbd;
ov |= (rows >> (8-1)) & 0x02;
@@ -411,7 +416,7 @@ static void restore_screen(void)
"1: rep;stosl ; "
"popw %%es"
: "+D" (dst), "+c" (npad)
- : "bdSm" (video_segment),
+ : "bdS" (video_segment),
"a" (0x07200720));
}
diff --git a/arch/i386/boot/video.h b/arch/i386/boot/video.h
index 29eca1710b2..b92447d5121 100644
--- a/arch/i386/boot/video.h
+++ b/arch/i386/boot/video.h
@@ -117,8 +117,15 @@ extern int graphic_mode; /* Graphics mode with linear frame buffer */
* int $0x10 is notorious for touching registers it shouldn't.
* gcc doesn't like %ebp being clobbered, so define it as a push/pop
* sequence here.
+ *
+ * A number of systems, including the original PC, can clobber %bp in
+ * certain circumstances, like when scrolling. There exists at least
+ * one Trident video card which could clobber DS under a set of
+ * circumstances that we are unlikely to encounter (scrolling when
+ * using an extended graphics mode of more than 800x600 pixels), but
+ * it's cheap insurance to deal with that here.
*/
-#define INT10 "pushl %%ebp; int $0x10; popl %%ebp"
+#define INT10 "pushl %%ebp; pushw %%ds; int $0x10; popw %%ds; popl %%ebp"
/* Accessing VGA indexed registers */
static inline u8 in_idx(u16 port, u8 index)
diff --git a/arch/i386/boot/voyager.c b/arch/i386/boot/voyager.c
index 9221614d0db..61c8fe0453b 100644
--- a/arch/i386/boot/voyager.c
+++ b/arch/i386/boot/voyager.c
@@ -32,7 +32,7 @@ int query_voyager(void)
"setc %0 ; "
"movw %%es, %1 ; "
"popw %%es"
- : "=qm" (err), "=rm" (es), "=D" (di)
+ : "=q" (err), "=r" (es), "=D" (di)
: "a" (0xffc0));
if (err)
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index 27a776c9044..25f7eb51392 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -17,6 +17,8 @@
#include <asm/thread_info.h>
#include <asm/elf.h>
+#include <xen/interface/xen.h>
+
#define DEFINE(sym, val) \
asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -59,6 +61,7 @@ void foo(void)
OFFSET(TI_addr_limit, thread_info, addr_limit);
OFFSET(TI_restart_block, thread_info, restart_block);
OFFSET(TI_sysenter_return, thread_info, sysenter_return);
+ OFFSET(TI_cpu, thread_info, cpu);
BLANK();
OFFSET(GDS_size, Xgt_desc_struct, size);
@@ -115,4 +118,10 @@ void foo(void)
OFFSET(PARAVIRT_iret, paravirt_ops, iret);
OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
#endif
+
+#ifdef CONFIG_XEN
+ BLANK();
+ OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
+ OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+#endif
}
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 3c3c220488c..a714d6b4350 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -409,8 +409,6 @@ restore_nocheck_notrace:
1: INTERRUPT_RETURN
.section .fixup,"ax"
iret_exc:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
pushl $0 # no error code
pushl $do_iret_error
jmp error_code
@@ -1023,6 +1021,91 @@ ENTRY(kernel_thread_helper)
CFI_ENDPROC
ENDPROC(kernel_thread_helper)
+#ifdef CONFIG_XEN
+ENTRY(xen_hypervisor_callback)
+ CFI_STARTPROC
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ TRACE_IRQS_OFF
+
+ /* Check to see if we got the event in the critical
+ region in xen_iret_direct, after we've reenabled
+ events and checked for pending events. This simulates
+ iret instruction's behaviour where it delivers a
+ pending interrupt when enabling interrupts. */
+ movl PT_EIP(%esp),%eax
+ cmpl $xen_iret_start_crit,%eax
+ jb 1f
+ cmpl $xen_iret_end_crit,%eax
+ jae 1f
+
+ call xen_iret_crit_fixup
+
+1: mov %esp, %eax
+ call xen_evtchn_do_upcall
+ jmp ret_from_intr
+ CFI_ENDPROC
+ENDPROC(xen_hypervisor_callback)
+
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+# 1. Fault while reloading DS, ES, FS or GS
+# 2. Fault while executing IRET
+# Category 1 we fix up by reattempting the load, and zeroing the segment
+# register if the load fails.
+# Category 2 we fix up by jumping to do_iret_error. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by maintaining a status value in EAX.
+ENTRY(xen_failsafe_callback)
+ CFI_STARTPROC
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ movl $1,%eax
+1: mov 4(%esp),%ds
+2: mov 8(%esp),%es
+3: mov 12(%esp),%fs
+4: mov 16(%esp),%gs
+ testl %eax,%eax
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ lea 16(%esp),%esp
+ CFI_ADJUST_CFA_OFFSET -16
+ jz 5f
+ addl $16,%esp
+ jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
+5: pushl $0 # EAX == 0 => Category 1 (Bad segment)
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ jmp ret_from_exception
+ CFI_ENDPROC
+
+.section .fixup,"ax"
+6: xorl %eax,%eax
+ movl %eax,4(%esp)
+ jmp 1b
+7: xorl %eax,%eax
+ movl %eax,8(%esp)
+ jmp 2b
+8: xorl %eax,%eax
+ movl %eax,12(%esp)
+ jmp 3b
+9: xorl %eax,%eax
+ movl %eax,16(%esp)
+ jmp 4b
+.previous
+.section __ex_table,"a"
+ .align 4
+ .long 1b,6b
+ .long 2b,7b
+ .long 3b,8b
+ .long 4b,9b
+.previous
+ENDPROC(xen_failsafe_callback)
+
+#endif /* CONFIG_XEN */
+
.section .rodata,"a"
#include "syscall_table.S"
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index 82714668d43..7c52b222207 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -510,7 +510,8 @@ ENTRY(_stext)
/*
* BSS section
*/
-.section ".bss.page_aligned","w"
+.section ".bss.page_aligned","wa"
+ .align PAGE_SIZE_asm
ENTRY(swapper_pg_dir)
.fill 1024,4,0
ENTRY(swapper_pg_pmd)
@@ -538,6 +539,8 @@ fault_msg:
.ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n"
.asciz "Stack: %p %p %p %p %p %p %p %p\n"
+#include "../xen/xen-head.S"
+
/*
* The IDT and GDT 'descriptors' are a strange 48-bit object
* only used by the lidt and lgdt instructions. They are not
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index faab09abca5..53f07a8275e 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -228,6 +228,41 @@ static int __init print_banner(void)
}
core_initcall(print_banner);
+static struct resource reserve_ioports = {
+ .start = 0,
+ .end = IO_SPACE_LIMIT,
+ .name = "paravirt-ioport",
+ .flags = IORESOURCE_IO | IORESOURCE_BUSY,
+};
+
+static struct resource reserve_iomem = {
+ .start = 0,
+ .end = -1,
+ .name = "paravirt-iomem",
+ .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
+};
+
+/*
+ * Reserve the whole legacy IO space to prevent any legacy drivers
+ * from wasting time probing for their hardware. This is a fairly
+ * brute-force approach to disabling all non-virtual drivers.
+ *
+ * Note that this must be called very early to have any effect.
+ */
+int paravirt_disable_iospace(void)
+{
+ int ret;
+
+ ret = request_resource(&ioport_resource, &reserve_ioports);
+ if (ret == 0) {
+ ret = request_resource(&iomem_resource, &reserve_iomem);
+ if (ret)
+ release_resource(&reserve_ioports);
+ }
+
+ return ret;
+}
+
struct paravirt_ops paravirt_ops = {
.name = "bare hardware",
.paravirt_enabled = 0,
@@ -267,7 +302,7 @@ struct paravirt_ops paravirt_ops = {
.write_msr = native_write_msr_safe,
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
- .get_scheduled_cycles = native_read_tsc,
+ .sched_clock = native_sched_clock,
.get_cpu_khz = native_calculate_cpu_khz,
.load_tr_desc = native_load_tr_desc,
.set_ldt = native_set_ldt,
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index 1c075f58d1f..0c8f00e69c4 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -164,14 +164,22 @@ static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_
u32 *desc;
unsigned long base;
- down(&child->mm->context.sem);
- desc = child->mm->context.ldt + (seg & ~7);
- base = (desc[0] >> 16) | ((desc[1] & 0xff) << 16) | (desc[1] & 0xff000000);
+ seg &= ~7UL;
- /* 16-bit code segment? */
- if (!((desc[1] >> 22) & 1))
- addr &= 0xffff;
- addr += base;
+ down(&child->mm->context.sem);
+ if (unlikely((seg >> 3) >= child->mm->context.size))
+ addr = -1L; /* bogus selector, access would fault */
+ else {
+ desc = child->mm->context.ldt + seg;
+ base = ((desc[0] >> 16) |
+ ((desc[1] & 0xff) << 16) |
+ (desc[1] & 0xff000000));
+
+ /* 16-bit code segment? */
+ if (!((desc[1] >> 22) & 1))
+ addr &= 0xffff;
+ addr += base;
+ }
up(&child->mm->context.sem);
}
return addr;
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 2d61e65eeb5..74871d066c2 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -601,6 +601,8 @@ void __init setup_arch(char **cmdline_p)
* NOTE: at this point the bootmem allocator is fully available.
*/
+ paravirt_post_allocator_init();
+
dmi_scan_machine();
#ifdef CONFIG_X86_GENERICARCH
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 6299c080f6e..2d35d850202 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -22,6 +22,7 @@
#include <asm/mtrr.h>
#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
#include <mach_apic.h>
/*
@@ -249,13 +250,13 @@ static unsigned long flush_va;
static DEFINE_SPINLOCK(tlbstate_lock);
/*
- * We cannot call mmdrop() because we are in interrupt context,
+ * We cannot call mmdrop() because we are in interrupt context,
* instead update mm->cpu_vm_mask.
*
* We need to reload %cr3 since the page tables may be going
* away from under us..
*/
-static inline void leave_mm (unsigned long cpu)
+void leave_mm(unsigned long cpu)
{
if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
BUG();
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 0b2954534b8..5910d3fac56 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -148,7 +148,7 @@ void __init smp_alloc_memory(void)
* a given CPU
*/
-static void __cpuinit smp_store_cpu_info(int id)
+void __cpuinit smp_store_cpu_info(int id)
{
struct cpuinfo_x86 *c = cpu_data + id;
@@ -308,8 +308,7 @@ cpumask_t cpu_coregroup_map(int cpu)
/* representing cpus for which sibling maps can be computed */
static cpumask_t cpu_sibling_setup_map;
-static inline void
-set_cpu_sibling_map(int cpu)
+void set_cpu_sibling_map(int cpu)
{
int i;
struct cpuinfo_x86 *c = cpu_data;
@@ -1144,8 +1143,7 @@ void __init native_smp_prepare_boot_cpu(void)
}
#ifdef CONFIG_HOTPLUG_CPU
-static void
-remove_siblinginfo(int cpu)
+void remove_siblinginfo(int cpu)
{
int sibling;
struct cpuinfo_x86 *c = cpu_data;
diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index bf6adce5226..8344c70adf6 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -323,3 +323,4 @@ ENTRY(sys_call_table)
.long sys_signalfd
.long sys_timerfd
.long sys_eventfd
+ .long sys_fallocate
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 18c1c285836..d32fd4b6f78 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -518,10 +518,12 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \
do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
}
-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
fastcall void do_##name(struct pt_regs * regs, long error_code) \
{ \
siginfo_t info; \
+ if (irq) \
+ local_irq_enable(); \
info.si_signo = signr; \
info.si_errno = 0; \
info.si_code = sicode; \
@@ -561,13 +563,13 @@ DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
#endif
DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip)
+DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0)
DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
-DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
-DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
+DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
+DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
fastcall void __kprobes do_general_protection(struct pt_regs * regs,
long error_code)
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index ea63a30ca3e..252f9010f28 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -84,7 +84,7 @@ static inline int check_tsc_unstable(void)
*
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
*/
-static unsigned long cyc2ns_scale __read_mostly;
+unsigned long cyc2ns_scale __read_mostly;
#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
@@ -93,15 +93,10 @@ static inline void set_cyc2ns_scale(unsigned long cpu_khz)
cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
}
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
-{
- return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
-}
-
/*
* Scheduler clock - returns current time in nanosec units.
*/
-unsigned long long sched_clock(void)
+unsigned long long native_sched_clock(void)
{
unsigned long long this_offset;
@@ -118,12 +113,24 @@ unsigned long long sched_clock(void)
return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
/* read the Time Stamp Counter: */
- get_scheduled_cycles(this_offset);
+ rdtscll(this_offset);
/* return the value in ns */
return cycles_2_ns(this_offset);
}
+/* We need to define a real function for sched_clock, to override the
+ weak default version */
+#ifdef CONFIG_PARAVIRT
+unsigned long long sched_clock(void)
+{
+ return paravirt_sched_clock();
+}
+#else
+unsigned long long sched_clock(void)
+ __attribute__((alias("native_sched_clock")));
+#endif
+
unsigned long native_calculate_cpu_khz(void)
{
unsigned long long start, end;
diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c
index c12720d7cbc..72042bb7ec9 100644
--- a/arch/i386/kernel/vmi.c
+++ b/arch/i386/kernel/vmi.c
@@ -362,7 +362,7 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
}
#endif
-static void vmi_allocate_pt(u32 pfn)
+static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
{
vmi_set_page_type(pfn, VMI_PAGE_L1);
vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
@@ -891,7 +891,7 @@ static inline int __init activate_vmi(void)
paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
#endif
- paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles;
+ paravirt_ops.sched_clock = vmi_sched_clock;
paravirt_ops.get_cpu_khz = vmi_cpu_khz;
/* We have true wallclock functions; disable CMOS clock sync */
diff --git a/arch/i386/kernel/vmiclock.c b/arch/i386/kernel/vmiclock.c
index 26a37f8a876..f9b845f4e69 100644
--- a/arch/i386/kernel/vmiclock.c
+++ b/arch/i386/kernel/vmiclock.c
@@ -64,10 +64,10 @@ int vmi_set_wallclock(unsigned long now)
return 0;
}
-/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */
-unsigned long long vmi_get_sched_cycles(void)
+/* paravirt_ops.sched_clock = vmi_sched_clock */
+unsigned long long vmi_sched_clock(void)
{
- return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
+ return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
}
/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index aa87b06c7c8..00f1bc47d3a 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -88,6 +88,7 @@ SECTIONS
. = ALIGN(4096);
.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+ *(.data.page_aligned)
*(.data.idt)
}
diff --git a/arch/i386/kernel/vsyscall-note.S b/arch/i386/kernel/vsyscall-note.S
index d4b5be4f3d5..271f16a8ca0 100644
--- a/arch/i386/kernel/vsyscall-note.S
+++ b/arch/i386/kernel/vsyscall-note.S
@@ -3,23 +3,40 @@
* Here we can supply some information useful to userland.
*/
-#include <linux/uts.h>
#include <linux/version.h>
+#include <linux/elfnote.h>
-#define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type) \
- .section name, flags; \
- .balign 4; \
- .long 1f - 0f; /* name length */ \
- .long 3f - 2f; /* data length */ \
- .long type; /* note type */ \
-0: .asciz vendor; /* vendor name */ \
-1: .balign 4; \
-2:
+/* Ideally this would use UTS_NAME, but using a quoted string here
+ doesn't work. Remember to change this when changing the
+ kernel's name. */
+ELFNOTE_START(Linux, 0, "a")
+ .long LINUX_VERSION_CODE
+ELFNOTE_END
-#define ASM_ELF_NOTE_END \
-3: .balign 4; /* pad out section */ \
- .previous
+#ifdef CONFIG_XEN
- ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0)
- .long LINUX_VERSION_CODE
- ASM_ELF_NOTE_END
+/*
+ * Add a special note telling glibc's dynamic linker a fake hardware
+ * flavor that it will use to choose the search path for libraries in the
+ * same way it uses real hardware capabilities like "mmx".
+ * We supply "nosegneg" as the fake capability, to indicate that we
+ * do not like negative offsets in instructions using segment overrides,
+ * since we implement those inefficiently. This makes it possible to
+ * install libraries optimized to avoid those access patterns somewhere
+ * like /lib/i686/tls/nosegneg.  Note that an /etc/ld.so.conf.d/ file
+ * corresponding to the bits here is needed to make ldconfig work right.
+ * It should contain:
+ * hwcap 1 nosegneg
+ * to match the mapping of bit to name that we give here.
+ */
+
+/* Bit used for the pseudo-hwcap for non-negative segments. We use
+ bit 1 to avoid bugs in some versions of glibc when bit 0 is
+ used; the choice is otherwise arbitrary. */
+#define VDSO_NOTE_NONEGSEG_BIT 1
+
+ELFNOTE_START(GNU, 2, "a")
+ .long 1, 1<<VDSO_NOTE_NONEGSEG_BIT /* ncaps, mask */
+ .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
+ELFNOTE_END
+#endif
diff --git a/arch/i386/mach-voyager/voyager_thread.c b/arch/i386/mach-voyager/voyager_thread.c
index b4b24e0e45e..f9d59533815 100644
--- a/arch/i386/mach-voyager/voyager_thread.c
+++ b/arch/i386/mach-voyager/voyager_thread.c
@@ -52,7 +52,7 @@ execute(const char *string)
NULL,
};
- if ((ret = call_usermodehelper(argv[0], argv, envp, 1)) != 0) {
+ if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) {
printk(KERN_ERR "Voyager failed to run \"%s\": %i\n",
string, ret);
}
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 7135946d366..6a68b1ae061 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -87,7 +87,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
- paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
+ paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
BUG_ON(page_table != pte_offset_kernel(pmd, 0));
}
@@ -473,6 +473,7 @@ void zap_low_mappings (void)
static int disable_nx __initdata = 0;
u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
/*
* noexec = on|off
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index 2eb14a73be9..37992ffb163 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -60,7 +60,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
address = __pa(address);
addr = address & LARGE_PAGE_MASK;
pbase = (pte_t *)page_address(base);
- paravirt_alloc_pt(page_to_pfn(base));
+ paravirt_alloc_pt(&init_mm, page_to_pfn(base));
for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
addr == address ? prot : ref_prot));
diff --git a/arch/i386/xen/Kconfig b/arch/i386/xen/Kconfig
new file mode 100644
index 00000000000..9df99e1885a
--- /dev/null
+++ b/arch/i386/xen/Kconfig
@@ -0,0 +1,11 @@
+#
+# This Kconfig describes xen options
+#
+
+config XEN
+ bool "Enable support for Xen hypervisor"
+ depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
+ help
+ This is the Linux Xen port. Enabling this will allow the
+ kernel to boot in a paravirtualized environment under the
+ Xen hypervisor.
diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile
new file mode 100644
index 00000000000..343df246bd3
--- /dev/null
+++ b/arch/i386/xen/Makefile
@@ -0,0 +1,4 @@
+obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \
+ events.o time.o manage.o xen-asm.o
+
+obj-$(CONFIG_SMP) += smp.o
diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
new file mode 100644
index 00000000000..9a8c1181c00
--- /dev/null
+++ b/arch/i386/xen/enlighten.c
@@ -0,0 +1,1144 @@
+/*
+ * Core of Xen paravirt_ops implementation.
+ *
+ * This file contains the xen_paravirt_ops structure itself, and the
+ * implementations for:
+ * - privileged instructions
+ * - interrupt flags
+ * - segment operations
+ * - booting and setup
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/preempt.h>
+#include <linux/hardirq.h>
+#include <linux/percpu.h>
+#include <linux/delay.h>
+#include <linux/start_kernel.h>
+#include <linux/sched.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/physdev.h>
+#include <xen/interface/vcpu.h>
+#include <xen/interface/sched.h>
+#include <xen/features.h>
+#include <xen/page.h>
+
+#include <asm/paravirt.h>
+#include <asm/page.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/fixmap.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/reboot.h>
+
+#include "xen-ops.h"
+#include "mmu.h"
+#include "multicalls.h"
+
+EXPORT_SYMBOL_GPL(hypercall_page);
+
+DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
+
+DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
+DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+DEFINE_PER_CPU(unsigned long, xen_cr3);
+
+struct start_info *xen_start_info;
+EXPORT_SYMBOL_GPL(xen_start_info);
+
+static /* __initdata */ struct shared_info dummy_shared_info;
+
+/*
+ * Point at some empty memory to start with. We map the real shared_info
+ * page as soon as fixmap is up and running.
+ */
+struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
+
+/*
+ * Flag to determine whether vcpu info placement is available on all
+ * VCPUs. We assume it is to start with, and then set it to zero on
+ * the first failure. This is because it can succeed on some VCPUs
+ * and not others, since it can involve hypervisor memory allocation,
+ * or because the guest failed to guarantee all the appropriate
+ * constraints on all VCPUs (i.e. the buffer can't cross a page boundary).
+ *
+ * Note that any particular CPU may be using a placed vcpu structure,
+ * but we can only optimise if they all are.
+ *
+ * 0: not available, 1: available
+ */
+static int have_vcpu_info_placement = 1;
+
+static void __init xen_vcpu_setup(int cpu)
+{
+ struct vcpu_register_vcpu_info info;
+ int err;
+ struct vcpu_info *vcpup;
+
+ per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+
+ if (!have_vcpu_info_placement)
+ return; /* already tested, not available */
+
+ vcpup = &per_cpu(xen_vcpu_info, cpu);
+
+ info.mfn = virt_to_mfn(vcpup);
+ info.offset = offset_in_page(vcpup);
+
+ printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n",
+ cpu, vcpup, info.mfn, info.offset);
+
+ /* Check to see if the hypervisor will put the vcpu_info
+ structure where we want it, which allows direct access via
+ a percpu-variable. */
+ err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
+
+ if (err) {
+ printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
+ have_vcpu_info_placement = 0;
+ } else {
+ /* This cpu is using the registered vcpu info, even if
+ later ones fail to. */
+ per_cpu(xen_vcpu, cpu) = vcpup;
+
+ printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
+ cpu, vcpup);
+ }
+}
+
+static void __init xen_banner(void)
+{
+ printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
+ paravirt_ops.name);
+ printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
+}
+
+static void xen_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ unsigned maskedx = ~0;
+
+ /*
+ * Mask out inconvenient features, to try and disable as many
+ * unsupported kernel subsystems as possible.
+ */
+ if (*eax == 1)
+ maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */
+ (1 << X86_FEATURE_ACPI) | /* disable ACPI */
+ (1 << X86_FEATURE_ACC)); /* thermal monitoring */
+
+ asm(XEN_EMULATE_PREFIX "cpuid"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx));
+ *edx &= maskedx;
+}
+
+static void xen_set_debugreg(int reg, unsigned long val)
+{
+ HYPERVISOR_set_debugreg(reg, val);
+}
+
+static unsigned long xen_get_debugreg(int reg)
+{
+ return HYPERVISOR_get_debugreg(reg);
+}
+
+static unsigned long xen_save_fl(void)
+{
+ struct vcpu_info *vcpu;
+ unsigned long flags;
+
+ vcpu = x86_read_percpu(xen_vcpu);
+
+ /* flag has opposite sense of mask */
+ flags = !vcpu->evtchn_upcall_mask;
+
+ /* convert to IF type flag
+ -0 -> 0x00000000
+ -1 -> 0xffffffff
+ */
+ return (-flags) & X86_EFLAGS_IF;
+}
+
+static void xen_restore_fl(unsigned long flags)
+{
+ struct vcpu_info *vcpu;
+
+ /* convert from IF type flag */
+ flags = !(flags & X86_EFLAGS_IF);
+
+	/* There's a one-instruction preempt window here.  We need to
+	   make sure we don't switch CPUs between getting the vcpu
+ pointer and updating the mask. */
+ preempt_disable();
+ vcpu = x86_read_percpu(xen_vcpu);
+ vcpu->evtchn_upcall_mask = flags;
+ preempt_enable_no_resched();
+
+ /* Doesn't matter if we get preempted here, because any
+ pending event will get dealt with anyway. */
+
+ if (flags == 0) {
+ preempt_check_resched();
+ barrier(); /* unmask then check (avoid races) */
+ if (unlikely(vcpu->evtchn_upcall_pending))
+ force_evtchn_callback();
+ }
+}
+
+static void xen_irq_disable(void)
+{
+	/* There's a one-instruction preempt window here.  We need to
+	   make sure we don't switch CPUs between getting the vcpu
+ pointer and updating the mask. */
+ preempt_disable();
+ x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
+ preempt_enable_no_resched();
+}
+
+static void xen_irq_enable(void)
+{
+ struct vcpu_info *vcpu;
+
+	/* There's a one-instruction preempt window here.  We need to
+	   make sure we don't switch CPUs between getting the vcpu
+ pointer and updating the mask. */
+ preempt_disable();
+ vcpu = x86_read_percpu(xen_vcpu);
+ vcpu->evtchn_upcall_mask = 0;
+ preempt_enable_no_resched();
+
+ /* Doesn't matter if we get preempted here, because any
+ pending event will get dealt with anyway. */
+
+ barrier(); /* unmask then check (avoid races) */
+ if (unlikely(vcpu->evtchn_upcall_pending))
+ force_evtchn_callback();
+}
+
+static void xen_safe_halt(void)
+{
+ /* Blocking includes an implicit local_irq_enable(). */
+ if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
+ BUG();
+}
+
+static void xen_halt(void)
+{
+ if (irqs_disabled())
+ HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
+ else
+ xen_safe_halt();
+}
+
+static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
+{
+ BUG_ON(preemptible());
+
+ switch (mode) {
+ case PARAVIRT_LAZY_NONE:
+ BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
+ break;
+
+ case PARAVIRT_LAZY_MMU:
+ case PARAVIRT_LAZY_CPU:
+ BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
+ break;
+
+ case PARAVIRT_LAZY_FLUSH:
+ /* flush if necessary, but don't change state */
+ if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
+ xen_mc_flush();
+ return;
+ }
+
+ xen_mc_flush();
+ x86_write_percpu(xen_lazy_mode, mode);
+}
+
+static unsigned long xen_store_tr(void)
+{
+ return 0;
+}
+
+static void xen_set_ldt(const void *addr, unsigned entries)
+{
+ unsigned long linear_addr = (unsigned long)addr;
+ struct mmuext_op *op;
+ struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_SET_LDT;
+ if (linear_addr) {
+		/* the LDT may be vmalloced, so use arbitrary_virt_to_machine */
+ xmaddr_t maddr;
+ maddr = arbitrary_virt_to_machine((unsigned long)addr);
+ linear_addr = (unsigned long)maddr.maddr;
+ }
+ op->arg1.linear_addr = linear_addr;
+ op->arg2.nr_ents = entries;
+
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void xen_load_gdt(const struct Xgt_desc_struct *dtr)
+{
+ unsigned long *frames;
+ unsigned long va = dtr->address;
+ unsigned int size = dtr->size + 1;
+ unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+ int f;
+ struct multicall_space mcs;
+
+ /* A GDT can be up to 64k in size, which corresponds to 8192
+	   8-byte entries, or 16 4k pages. */
+
+ BUG_ON(size > 65536);
+ BUG_ON(va & ~PAGE_MASK);
+
+ mcs = xen_mc_entry(sizeof(*frames) * pages);
+ frames = mcs.args;
+
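+	/* Record the mfn of each page backing the GDT and make it
+	   read-only before handing it to Xen. */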
+ for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
+ frames[f] = virt_to_mfn(va);
+ make_lowmem_page_readonly((void *)va);
+ }
+
+ MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void load_TLS_descriptor(struct thread_struct *t,
+ unsigned int cpu, unsigned int i)
+{
+ struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+ xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+ struct multicall_space mc = __xen_mc_entry(0);
+
+ MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
+}
+
+static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+ xen_mc_batch();
+
+ load_TLS_descriptor(t, cpu, 0);
+ load_TLS_descriptor(t, cpu, 1);
+ load_TLS_descriptor(t, cpu, 2);
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+
+ /*
+ * XXX sleazy hack: If we're being called in a lazy-cpu zone,
+ * it means we're in a context switch, and %gs has just been
+ * saved. This means we can zero it out to prevent faults on
+ * exit from the hypervisor if the next process has no %gs.
+ * Either way, it has been saved, and the new value will get
+ * loaded properly. This will go away as soon as Xen has been
+ * modified to not save/restore %gs for normal hypercalls.
+ */
+ if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+ loadsegment(gs, 0);
+}
+
+static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
+ u32 low, u32 high)
+{
+ unsigned long lp = (unsigned long)&dt[entrynum];
+ xmaddr_t mach_lp = virt_to_machine(lp);
+ u64 entry = (u64)high << 32 | low;
+
+ preempt_disable();
+
+ xen_mc_flush();
+ if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
+ BUG();
+
+ preempt_enable();
+}
+
+static int cvt_gate_to_trap(int vector, u32 low, u32 high,
+ struct trap_info *info)
+{
+ u8 type, dpl;
+
+ type = (high >> 8) & 0x1f;
+ dpl = (high >> 13) & 3;
+
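+	/* Only 32-bit interrupt (0xe) and trap (0xf) gates can be
+	   expressed as Xen trap_info entries; ignore everything else. */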
+ if (type != 0xf && type != 0xe)
+ return 0;
+
+ info->vector = vector;
+ info->address = (high & 0xffff0000) | (low & 0x0000ffff);
+ info->cs = low >> 16;
+ info->flags = dpl;
+ /* interrupt gates clear IF */
+ if (type == 0xe)
+ info->flags |= 4;
+
+ return 1;
+}
+
+/* Locations of each CPU's IDT */
+static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc);
+
+/* Set an IDT entry. If the entry is part of the current IDT, then
+ also update Xen. */
+static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
+ u32 low, u32 high)
+{
+ unsigned long p = (unsigned long)&dt[entrynum];
+ unsigned long start, end;
+
+ preempt_disable();
+
+ start = __get_cpu_var(idt_desc).address;
+ end = start + __get_cpu_var(idt_desc).size + 1;
+
+ xen_mc_flush();
+
+ write_dt_entry(dt, entrynum, low, high);
+
+ if (p >= start && (p + 8) <= end) {
+ struct trap_info info[2];
+
+ info[1].address = 0;
+
+ if (cvt_gate_to_trap(entrynum, low, high, &info[0]))
+ if (HYPERVISOR_set_trap_table(info))
+ BUG();
+ }
+
+ preempt_enable();
+}
+
+static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
+ struct trap_info *traps)
+{
+ unsigned in, out, count;
+
+ count = (desc->size+1) / 8;
+ BUG_ON(count > 256);
+
+ for (in = out = 0; in < count; in++) {
+ const u32 *entry = (u32 *)(desc->address + in * 8);
+
+ if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
+ out++;
+ }
+ traps[out].address = 0;
+}
+
+void xen_copy_trap_info(struct trap_info *traps)
+{
+ const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc);
+
+ xen_convert_trap_info(desc, traps);
+}
+
+/* Load a new IDT into Xen. In principle this can be per-CPU, so we
+ hold a spinlock to protect the static traps[] array (static because
+ it avoids allocation, and saves stack space). */
+static void xen_load_idt(const struct Xgt_desc_struct *desc)
+{
+ static DEFINE_SPINLOCK(lock);
+ static struct trap_info traps[257];
+
+ spin_lock(&lock);
+
+ __get_cpu_var(idt_desc) = *desc;
+
+ xen_convert_trap_info(desc, traps);
+
+ xen_mc_flush();
+ if (HYPERVISOR_set_trap_table(traps))
+ BUG();
+
+ spin_unlock(&lock);
+}
+
+/* Write a GDT descriptor entry. Ignore LDT descriptors, since
+ they're handled differently. */
+static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
+ u32 low, u32 high)
+{
+ preempt_disable();
+
+ switch ((high >> 8) & 0xff) {
+ case DESCTYPE_LDT:
+ case DESCTYPE_TSS:
+ /* ignore */
+ break;
+
+ default: {
+ xmaddr_t maddr = virt_to_machine(&dt[entry]);
+ u64 desc = (u64)high << 32 | low;
+
+ xen_mc_flush();
+ if (HYPERVISOR_update_descriptor(maddr.maddr, desc))
+ BUG();
+ }
+
+ }
+
+ preempt_enable();
+}
+
+static void xen_load_esp0(struct tss_struct *tss,
+ struct thread_struct *thread)
+{
+ struct multicall_space mcs = xen_mc_entry(0);
+ MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0);
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void xen_set_iopl_mask(unsigned mask)
+{
+ struct physdev_set_iopl set_iopl;
+
+ /* Force the change at ring 0. */
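+	/* The EFLAGS IOPL field lives in bits 12-13; an iopl of 0 is mapped to 1. */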
+ set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
+ HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+}
+
+static void xen_io_delay(void)
+{
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static unsigned long xen_apic_read(unsigned long reg)
+{
+ return 0;
+}
+
+static void xen_apic_write(unsigned long reg, unsigned long val)
+{
+	/* Warn to see if there are any stray references */
+ WARN_ON(1);
+}
+#endif
+
+static void xen_flush_tlb(void)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static void xen_flush_tlb_single(unsigned long addr)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_INVLPG_LOCAL;
+ op->arg1.linear_addr = addr & PAGE_MASK;
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
+ unsigned long va)
+{
+ struct {
+ struct mmuext_op op;
+ cpumask_t mask;
+ } *args;
+ cpumask_t cpumask = *cpus;
+ struct multicall_space mcs;
+
+ /*
+ * A couple of (to be removed) sanity checks:
+ *
+ * - current CPU must not be in mask
+ * - mask must exist :)
+ */
+ BUG_ON(cpus_empty(cpumask));
+ BUG_ON(cpu_isset(smp_processor_id(), cpumask));
+ BUG_ON(!mm);
+
+ /* If a CPU which we ran on has gone down, OK. */
+ cpus_and(cpumask, cpumask, cpu_online_map);
+ if (cpus_empty(cpumask))
+ return;
+
+ mcs = xen_mc_entry(sizeof(*args));
+ args = mcs.args;
+ args->mask = cpumask;
+ args->op.arg2.vcpumask = &args->mask;
+
+ if (va == TLB_FLUSH_ALL) {
+ args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+ } else {
+ args->op.cmd = MMUEXT_INVLPG_MULTI;
+ args->op.arg1.linear_addr = va;
+ }
+
+ MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static void xen_write_cr2(unsigned long cr2)
+{
+ x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
+}
+
+static unsigned long xen_read_cr2(void)
+{
+ return x86_read_percpu(xen_vcpu)->arch.cr2;
+}
+
+static unsigned long xen_read_cr2_direct(void)
+{
+ return x86_read_percpu(xen_vcpu_info.arch.cr2);
+}
+
+static void xen_write_cr4(unsigned long cr4)
+{
+ /* never allow TSC to be disabled */
+ native_write_cr4(cr4 & ~X86_CR4_TSD);
+}
+
+static unsigned long xen_read_cr3(void)
+{
+ return x86_read_percpu(xen_cr3);
+}
+
+static void xen_write_cr3(unsigned long cr3)
+{
+ BUG_ON(preemptible());
+
+ if (cr3 == x86_read_percpu(xen_cr3)) {
+ /* just a simple tlb flush */
+ xen_flush_tlb();
+ return;
+ }
+
+ x86_write_percpu(xen_cr3, cr3);
+
+
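+	/* Tell Xen about the new base pagetable via a batched
+	   MMUEXT_NEW_BASEPTR multicall. */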
+ {
+ struct mmuext_op *op;
+ struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+ unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_NEW_BASEPTR;
+ op->arg1.mfn = mfn;
+
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+ }
+}
+
+/* Early in boot, while setting up the initial pagetable, assume
+ everything is pinned. */
+static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
+{
+ BUG_ON(mem_map); /* should only be used early */
+ make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+}
+
+/* This needs to make sure the new pte page is pinned iff it's being
+ attached to a pinned pagetable. */
+static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
+{
+ struct page *page = pfn_to_page(pfn);
+
+ if (PagePinned(virt_to_page(mm->pgd))) {
+ SetPagePinned(page);
+
+ if (!PageHighMem(page))
+ make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+ else
+ /* make sure there are no stray mappings of
+ this page */
+ kmap_flush_unused();
+ }
+}
+
+/* This should never happen until we're OK to use struct page */
+static void xen_release_pt(u32 pfn)
+{
+ struct page *page = pfn_to_page(pfn);
+
+ if (PagePinned(page)) {
+ if (!PageHighMem(page))
+ make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+ }
+}
+
+#ifdef CONFIG_HIGHPTE
+static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
+{
+ pgprot_t prot = PAGE_KERNEL;
+
+ if (PagePinned(page))
+ prot = PAGE_KERNEL_RO;
+
+ if (0 && PageHighMem(page))
+ printk("mapping highpte %lx type %d prot %s\n",
+ page_to_pfn(page), type,
+ (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
+
+ return kmap_atomic_prot(page, type, prot);
+}
+#endif
+
+static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
+{
+ /* If there's an existing pte, then don't allow _PAGE_RW to be set */
+ if (pte_val_ma(*ptep) & _PAGE_PRESENT)
+ pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
+ pte_val_ma(pte));
+
+ return pte;
+}
+
+/* Init-time set_pte while constructing initial pagetables, which
+ doesn't allow RO pagetable pages to be remapped RW */
+static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
+{
+ pte = mask_rw_pte(ptep, pte);
+
+ xen_set_pte(ptep, pte);
+}
+
+static __init void xen_pagetable_setup_start(pgd_t *base)
+{
+ pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
+
+ /* special set_pte for pagetable initialization */
+ paravirt_ops.set_pte = xen_set_pte_init;
+
+ init_mm.pgd = base;
+ /*
+ * copy top-level of Xen-supplied pagetable into place. For
+ * !PAE we can use this as-is, but for PAE it is a stand-in
+ * while we copy the pmd pages.
+ */
+ memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
+
+ if (PTRS_PER_PMD > 1) {
+ int i;
+ /*
+ * For PAE, need to allocate new pmds, rather than
+		 * share Xen's, since Xen doesn't like pmds being
+ * shared between address spaces.
+ */
+ for (i = 0; i < PTRS_PER_PGD; i++) {
+ if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
+ pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+
+ memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
+ PAGE_SIZE);
+
+ make_lowmem_page_readonly(pmd);
+
+ set_pgd(&base[i], __pgd(1 + __pa(pmd)));
+ } else
+ pgd_clear(&base[i]);
+ }
+ }
+
+ /* make sure zero_page is mapped RO so we can use it in pagetables */
+ make_lowmem_page_readonly(empty_zero_page);
+ make_lowmem_page_readonly(base);
+ /*
+ * Switch to new pagetable. This is done before
+ * pagetable_init has done anything so that the new pages
+ * added to the table can be prepared properly for Xen.
+ */
+ xen_write_cr3(__pa(base));
+}
+
+static __init void xen_pagetable_setup_done(pgd_t *base)
+{
+ /* This will work as long as patching hasn't happened yet
+ (which it hasn't) */
+ paravirt_ops.alloc_pt = xen_alloc_pt;
+ paravirt_ops.set_pte = xen_set_pte;
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ /*
+ * Create a mapping for the shared info page.
+ * Should be set_fixmap(), but shared_info is a machine
+ * address with no corresponding pseudo-phys address.
+ */
+ set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
+ PFN_DOWN(xen_start_info->shared_info),
+ PAGE_KERNEL);
+
+ HYPERVISOR_shared_info =
+ (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
+
+ } else
+ HYPERVISOR_shared_info =
+ (struct shared_info *)__va(xen_start_info->shared_info);
+
+ /* Actually pin the pagetable down, but we can't set PG_pinned
+ yet because the page structures don't exist yet. */
+ {
+ struct mmuext_op op;
+#ifdef CONFIG_X86_PAE
+ op.cmd = MMUEXT_PIN_L3_TABLE;
+#else
+		op.cmd = MMUEXT_PIN_L2_TABLE;
+#endif
+ op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
+ if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+ BUG();
+ }
+}
+
+/* This is called once we have the cpu_possible_map */
+void __init xen_setup_vcpu_info_placement(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ xen_vcpu_setup(cpu);
+
+	/* If xen_vcpu_setup managed to place the vcpu_info within the
+	   percpu area for all cpus, make use of the direct variants. */
+ if (have_vcpu_info_placement) {
+ printk(KERN_INFO "Xen: using vcpu_info placement\n");
+
+ paravirt_ops.save_fl = xen_save_fl_direct;
+ paravirt_ops.restore_fl = xen_restore_fl_direct;
+ paravirt_ops.irq_disable = xen_irq_disable_direct;
+ paravirt_ops.irq_enable = xen_irq_enable_direct;
+ paravirt_ops.read_cr2 = xen_read_cr2_direct;
+ paravirt_ops.iret = xen_iret_direct;
+ }
+}
+
+static unsigned xen_patch(u8 type, u16 clobbers, void *insns, unsigned len)
+{
+ char *start, *end, *reloc;
+ unsigned ret;
+
+ start = end = reloc = NULL;
+
+#define SITE(x) \
+ case PARAVIRT_PATCH(x): \
+ if (have_vcpu_info_placement) { \
+ start = (char *)xen_##x##_direct; \
+ end = xen_##x##_direct_end; \
+ reloc = xen_##x##_direct_reloc; \
+ } \
+ goto patch_site
+
+ switch (type) {
+ SITE(irq_enable);
+ SITE(irq_disable);
+ SITE(save_fl);
+ SITE(restore_fl);
+#undef SITE
+
+ patch_site:
+ if (start == NULL || (end-start) > len)
+ goto default_patch;
+
+ ret = paravirt_patch_insns(insns, len, start, end);
+
+ /* Note: because reloc is assigned from something that
+ appears to be an array, gcc assumes it's non-null,
+ but doesn't know its relationship with start and
+ end. */
+ if (reloc > start && reloc < end) {
+ int reloc_off = reloc - start;
+ long *relocp = (long *)(insns + reloc_off);
+ long delta = start - (char *)insns;
+
+ *relocp += delta;
+ }
+ break;
+
+ default_patch:
+ default:
+ ret = paravirt_patch_default(type, clobbers, insns, len);
+ break;
+ }
+
+ return ret;
+}
+
+static const struct paravirt_ops xen_paravirt_ops __initdata = {
+ .paravirt_enabled = 1,
+ .shared_kernel_pmd = 0,
+
+ .name = "Xen",
+ .banner = xen_banner,
+
+ .patch = xen_patch,
+
+ .memory_setup = xen_memory_setup,
+ .arch_setup = xen_arch_setup,
+ .init_IRQ = xen_init_IRQ,
+ .post_allocator_init = xen_mark_init_mm_pinned,
+
+ .time_init = xen_time_init,
+ .set_wallclock = xen_set_wallclock,
+ .get_wallclock = xen_get_wallclock,
+ .get_cpu_khz = xen_cpu_khz,
+ .sched_clock = xen_sched_clock,
+
+ .cpuid = xen_cpuid,
+
+ .set_debugreg = xen_set_debugreg,
+ .get_debugreg = xen_get_debugreg,
+
+ .clts = native_clts,
+
+ .read_cr0 = native_read_cr0,
+ .write_cr0 = native_write_cr0,
+
+ .read_cr2 = xen_read_cr2,
+ .write_cr2 = xen_write_cr2,
+
+ .read_cr3 = xen_read_cr3,
+ .write_cr3 = xen_write_cr3,
+
+ .read_cr4 = native_read_cr4,
+ .read_cr4_safe = native_read_cr4_safe,
+ .write_cr4 = xen_write_cr4,
+
+ .save_fl = xen_save_fl,
+ .restore_fl = xen_restore_fl,
+ .irq_disable = xen_irq_disable,
+ .irq_enable = xen_irq_enable,
+ .safe_halt = xen_safe_halt,
+ .halt = xen_halt,
+ .wbinvd = native_wbinvd,
+
+ .read_msr = native_read_msr_safe,
+ .write_msr = native_write_msr_safe,
+ .read_tsc = native_read_tsc,
+ .read_pmc = native_read_pmc,
+
+ .iret = (void *)&hypercall_page[__HYPERVISOR_iret],
+ .irq_enable_sysexit = NULL, /* never called */
+
+ .load_tr_desc = paravirt_nop,
+ .set_ldt = xen_set_ldt,
+ .load_gdt = xen_load_gdt,
+ .load_idt = xen_load_idt,
+ .load_tls = xen_load_tls,
+
+ .store_gdt = native_store_gdt,
+ .store_idt = native_store_idt,
+ .store_tr = xen_store_tr,
+
+ .write_ldt_entry = xen_write_ldt_entry,
+ .write_gdt_entry = xen_write_gdt_entry,
+ .write_idt_entry = xen_write_idt_entry,
+ .load_esp0 = xen_load_esp0,
+
+ .set_iopl_mask = xen_set_iopl_mask,
+ .io_delay = xen_io_delay,
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ .apic_write = xen_apic_write,
+ .apic_write_atomic = xen_apic_write,
+ .apic_read = xen_apic_read,
+ .setup_boot_clock = paravirt_nop,
+ .setup_secondary_clock = paravirt_nop,
+ .startup_ipi_hook = paravirt_nop,
+#endif
+
+ .flush_tlb_user = xen_flush_tlb,
+ .flush_tlb_kernel = xen_flush_tlb,
+ .flush_tlb_single = xen_flush_tlb_single,
+ .flush_tlb_others = xen_flush_tlb_others,
+
+ .pte_update = paravirt_nop,
+ .pte_update_defer = paravirt_nop,
+
+ .pagetable_setup_start = xen_pagetable_setup_start,
+ .pagetable_setup_done = xen_pagetable_setup_done,
+
+ .alloc_pt = xen_alloc_pt_init,
+ .release_pt = xen_release_pt,
+ .alloc_pd = paravirt_nop,
+ .alloc_pd_clone = paravirt_nop,
+ .release_pd = paravirt_nop,
+
+#ifdef CONFIG_HIGHPTE
+ .kmap_atomic_pte = xen_kmap_atomic_pte,
+#endif
+
+ .set_pte = NULL, /* see xen_pagetable_setup_* */
+ .set_pte_at = xen_set_pte_at,
+ .set_pmd = xen_set_pmd,
+
+ .pte_val = xen_pte_val,
+ .pgd_val = xen_pgd_val,
+
+ .make_pte = xen_make_pte,
+ .make_pgd = xen_make_pgd,
+
+#ifdef CONFIG_X86_PAE
+ .set_pte_atomic = xen_set_pte_atomic,
+ .set_pte_present = xen_set_pte_at,
+ .set_pud = xen_set_pud,
+ .pte_clear = xen_pte_clear,
+ .pmd_clear = xen_pmd_clear,
+
+ .make_pmd = xen_make_pmd,
+ .pmd_val = xen_pmd_val,
+#endif /* PAE */
+
+ .activate_mm = xen_activate_mm,
+ .dup_mmap = xen_dup_mmap,
+ .exit_mmap = xen_exit_mmap,
+
+ .set_lazy_mode = xen_set_lazy_mode,
+};
+
+#ifdef CONFIG_SMP
+static const struct smp_ops xen_smp_ops __initdata = {
+ .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
+ .smp_prepare_cpus = xen_smp_prepare_cpus,
+ .cpu_up = xen_cpu_up,
+ .smp_cpus_done = xen_smp_cpus_done,
+
+ .smp_send_stop = xen_smp_send_stop,
+ .smp_send_reschedule = xen_smp_send_reschedule,
+ .smp_call_function_mask = xen_smp_call_function_mask,
+};
+#endif /* CONFIG_SMP */
+
+static void xen_reboot(int reason)
+{
+#ifdef CONFIG_SMP
+ smp_send_stop();
+#endif
+
+ if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason))
+ BUG();
+}
+
+static void xen_restart(char *msg)
+{
+ xen_reboot(SHUTDOWN_reboot);
+}
+
+static void xen_emergency_restart(void)
+{
+ xen_reboot(SHUTDOWN_reboot);
+}
+
+static void xen_machine_halt(void)
+{
+ xen_reboot(SHUTDOWN_poweroff);
+}
+
+static void xen_crash_shutdown(struct pt_regs *regs)
+{
+ xen_reboot(SHUTDOWN_crash);
+}
+
+static const struct machine_ops __initdata xen_machine_ops = {
+ .restart = xen_restart,
+ .halt = xen_machine_halt,
+ .power_off = xen_machine_halt,
+ .shutdown = xen_machine_halt,
+ .crash_shutdown = xen_crash_shutdown,
+ .emergency_restart = xen_emergency_restart,
+};
+
+
+/* First C function to be called on Xen boot */
+asmlinkage void __init xen_start_kernel(void)
+{
+ pgd_t *pgd;
+
+ if (!xen_start_info)
+ return;
+
+ BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
+
+ /* Install Xen paravirt ops */
+ paravirt_ops = xen_paravirt_ops;
+ machine_ops = xen_machine_ops;
+
+#ifdef CONFIG_SMP
+ smp_ops = xen_smp_ops;
+#endif
+
+ xen_setup_features();
+
+ /* Get mfn list */
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list;
+
+ pgd = (pgd_t *)xen_start_info->pt_base;
+
+ init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
+
+ init_mm.pgd = pgd; /* use the Xen pagetables to start */
+
+ /* keep using Xen gdt for now; no urgent need to change it */
+
+ x86_write_percpu(xen_cr3, __pa(pgd));
+
+#ifdef CONFIG_SMP
+ /* Don't do the full vcpu_info placement stuff until we have a
+ possible map. */
+ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
+#else
+ /* May as well do it now, since there's no good time to call
+ it later on UP. */
+ xen_setup_vcpu_info_placement();
+#endif
+
+ paravirt_ops.kernel_rpl = 1;
+ if (xen_feature(XENFEAT_supervisor_mode_kernel))
+ paravirt_ops.kernel_rpl = 0;
+
+ /* set the limit of our address space */
+ reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);
+
+ /* set up basic CPUID stuff */
+ cpu_detect(&new_cpu_data);
+ new_cpu_data.hard_math = 1;
+ new_cpu_data.x86_capability[0] = cpuid_edx(1);
+
+ /* Poke various useful things into boot_params */
+ LOADER_TYPE = (9 << 4) | 0;
+ INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0;
+ INITRD_SIZE = xen_start_info->mod_len;
+
+ /* Start the world */
+ start_kernel();
+}
diff --git a/arch/i386/xen/events.c b/arch/i386/xen/events.c
new file mode 100644
index 00000000000..8904acc20f8
--- /dev/null
+++ b/arch/i386/xen/events.c
@@ -0,0 +1,590 @@
+/*
+ * Xen event channels
+ *
+ * Xen models interrupts with abstract event channels. Because each
+ * domain gets 1024 event channels, but NR_IRQS is not that large, we
+ * must dynamically map irqs<->event channels. The event channels
+ * interface with the rest of the kernel by defining a xen interrupt
+ * chip. When an event is received, it is mapped to an irq and sent
+ * through the normal interrupt processing path.
+ *
+ * There are four kinds of events which can be mapped to an event
+ * channel:
+ *
+ * 1. Inter-domain notifications. This includes all the virtual
+ * device events, since they're driven by front-ends in another domain
+ * (typically dom0).
+ * 2. VIRQs, typically used for timers. These are per-cpu events.
+ * 3. IPIs.
+ * 4. Hardware interrupts. Not supported at present.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#include <linux/linkage.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <asm/ptrace.h>
+#include <asm/irq.h>
+#include <asm/sync_bitops.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+
+#include "xen-ops.h"
+
+/*
+ * This lock protects updates to the following mapping and reference-count
+ * arrays. The lock does not need to be acquired to read the mapping tables.
+ */
+static DEFINE_SPINLOCK(irq_mapping_update_lock);
+
+/* IRQ <-> VIRQ mapping. */
+static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
+
+/* IRQ <-> IPI mapping */
+static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
+
+/* Packed IRQ information: binding type, sub-type index, and event channel. */
+struct packed_irq
+{
+ unsigned short evtchn;
+ unsigned char index;
+ unsigned char type;
+};
+
+static struct packed_irq irq_info[NR_IRQS];
+
+/* Binding types. */
+enum {
+ IRQT_UNBOUND,
+ IRQT_PIRQ,
+ IRQT_VIRQ,
+ IRQT_IPI,
+ IRQT_EVTCHN
+};
+
+/* Convenient shorthand for packed representation of an unbound IRQ. */
+#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
+
+static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
+ [0 ... NR_EVENT_CHANNELS-1] = -1
+};
+static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
+static u8 cpu_evtchn[NR_EVENT_CHANNELS];
+
+/* Reference counts for bindings to IRQs. */
+static int irq_bindcount[NR_IRQS];
+
+/* Xen will never allocate port zero for any purpose. */
+#define VALID_EVTCHN(chn) ((chn) != 0)
+
+/*
+ * Force a proper event-channel callback from Xen after clearing the
+ * callback mask. We do this in a very simple manner, by making a call
+ * down into Xen. The pending flag will be checked by Xen on return.
+ */
+void force_evtchn_callback(void)
+{
+ (void)HYPERVISOR_xen_version(0, NULL);
+}
+EXPORT_SYMBOL_GPL(force_evtchn_callback);
+
+static struct irq_chip xen_dynamic_chip;
+
+/* Constructor for packed IRQ information. */
+static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
+{
+ return (struct packed_irq) { evtchn, index, type };
+}
+
+/*
+ * Accessors for packed IRQ information.
+ */
+static inline unsigned int evtchn_from_irq(int irq)
+{
+ return irq_info[irq].evtchn;
+}
+
+static inline unsigned int index_from_irq(int irq)
+{
+ return irq_info[irq].index;
+}
+
+static inline unsigned int type_from_irq(int irq)
+{
+ return irq_info[irq].type;
+}
+
+static inline unsigned long active_evtchns(unsigned int cpu,
+ struct shared_info *sh,
+ unsigned int idx)
+{
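+	/* Events that are pending, routed to this cpu, and not masked. */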
+ return (sh->evtchn_pending[idx] &
+ cpu_evtchn_mask[cpu][idx] &
+ ~sh->evtchn_mask[idx]);
+}
+
+static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
+{
+ int irq = evtchn_to_irq[chn];
+
+ BUG_ON(irq == -1);
+#ifdef CONFIG_SMP
+ irq_desc[irq].affinity = cpumask_of_cpu(cpu);
+#endif
+
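+	/* Move the channel from its previous cpu's mask to the new cpu's. */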
+ __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
+ __set_bit(chn, cpu_evtchn_mask[cpu]);
+
+ cpu_evtchn[chn] = cpu;
+}
+
+static void init_evtchn_cpu_bindings(void)
+{
+#ifdef CONFIG_SMP
+ int i;
+ /* By default all event channels notify CPU#0. */
+ for (i = 0; i < NR_IRQS; i++)
+ irq_desc[i].affinity = cpumask_of_cpu(0);
+#endif
+
+ memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
+ memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
+}
+
+static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
+{
+ return cpu_evtchn[evtchn];
+}
+
+static inline void clear_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ sync_clear_bit(port, &s->evtchn_pending[0]);
+}
+
+static inline void set_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ sync_set_bit(port, &s->evtchn_pending[0]);
+}
+
+
+/**
+ * notify_remote_via_irq - send event to remote end of event channel via irq
+ * @irq: irq of event channel to send event to
+ *
+ * Unlike notify_remote_via_evtchn(), this is safe to use across
+ * save/restore. Notifications on a broken connection are silently
+ * dropped.
+ */
+void notify_remote_via_irq(int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ notify_remote_via_evtchn(evtchn);
+}
+EXPORT_SYMBOL_GPL(notify_remote_via_irq);
+
+static void mask_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ sync_set_bit(port, &s->evtchn_mask[0]);
+}
+
+static void unmask_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ unsigned int cpu = get_cpu();
+
+ BUG_ON(!irqs_disabled());
+
+ /* Slow path (hypercall) if this is a non-local port. */
+ if (unlikely(cpu != cpu_from_evtchn(port))) {
+ struct evtchn_unmask unmask = { .port = port };
+ (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
+ } else {
+ struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
+
+ sync_clear_bit(port, &s->evtchn_mask[0]);
+
+ /*
+ * The following is basically the equivalent of
+ * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
+ * the interrupt edge' if the channel is masked.
+ */
+ if (sync_test_bit(port, &s->evtchn_pending[0]) &&
+ !sync_test_and_set_bit(port / BITS_PER_LONG,
+ &vcpu_info->evtchn_pending_sel))
+ vcpu_info->evtchn_upcall_pending = 1;
+ }
+
+ put_cpu();
+}
+
+static int find_unbound_irq(void)
+{
+ int irq;
+
+ /* Only allocate from dynirq range */
+ for (irq = 0; irq < NR_IRQS; irq++)
+ if (irq_bindcount[irq] == 0)
+ break;
+
+ if (irq == NR_IRQS)
+ panic("No available IRQ to bind to: increase NR_IRQS!\n");
+
+ return irq;
+}
+
+int bind_evtchn_to_irq(unsigned int evtchn)
+{
+ int irq;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ irq = evtchn_to_irq[evtchn];
+
+ if (irq == -1) {
+ irq = find_unbound_irq();
+
+ dynamic_irq_init(irq);
+ set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+ handle_level_irq, "event");
+
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
+ }
+
+ irq_bindcount[irq]++;
+
+ spin_unlock(&irq_mapping_update_lock);
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
+
+static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
+{
+ struct evtchn_bind_ipi bind_ipi;
+ int evtchn, irq;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ irq = per_cpu(ipi_to_irq, cpu)[ipi];
+ if (irq == -1) {
+ irq = find_unbound_irq();
+ if (irq < 0)
+ goto out;
+
+ dynamic_irq_init(irq);
+ set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+ handle_level_irq, "ipi");
+
+ bind_ipi.vcpu = cpu;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
+ &bind_ipi) != 0)
+ BUG();
+ evtchn = bind_ipi.port;
+
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
+
+ per_cpu(ipi_to_irq, cpu)[ipi] = irq;
+
+ bind_evtchn_to_cpu(evtchn, cpu);
+ }
+
+ irq_bindcount[irq]++;
+
+ out:
+ spin_unlock(&irq_mapping_update_lock);
+ return irq;
+}
+
+
+static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+{
+ struct evtchn_bind_virq bind_virq;
+ int evtchn, irq;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ irq = per_cpu(virq_to_irq, cpu)[virq];
+
+ if (irq == -1) {
+ bind_virq.virq = virq;
+ bind_virq.vcpu = cpu;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+ &bind_virq) != 0)
+ BUG();
+ evtchn = bind_virq.port;
+
+ irq = find_unbound_irq();
+
+ dynamic_irq_init(irq);
+ set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+ handle_level_irq, "virq");
+
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
+
+ per_cpu(virq_to_irq, cpu)[virq] = irq;
+
+ bind_evtchn_to_cpu(evtchn, cpu);
+ }
+
+ irq_bindcount[irq]++;
+
+ spin_unlock(&irq_mapping_update_lock);
+
+ return irq;
+}
+
+static void unbind_from_irq(unsigned int irq)
+{
+ struct evtchn_close close;
+ int evtchn = evtchn_from_irq(irq);
+
+ spin_lock(&irq_mapping_update_lock);
+
+ if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
+ close.port = evtchn;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
+ BUG();
+
+ switch (type_from_irq(irq)) {
+ case IRQT_VIRQ:
+ per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
+ [index_from_irq(irq)] = -1;
+ break;
+ default:
+ break;
+ }
+
+ /* Closed ports are implicitly re-bound to VCPU0. */
+ bind_evtchn_to_cpu(evtchn, 0);
+
+ evtchn_to_irq[evtchn] = -1;
+ irq_info[irq] = IRQ_UNBOUND;
+
+ dynamic_irq_init(irq);
+ }
+
+ spin_unlock(&irq_mapping_update_lock);
+}
+
+int bind_evtchn_to_irqhandler(unsigned int evtchn,
+ irqreturn_t (*handler)(int, void *),
+ unsigned long irqflags,
+ const char *devname, void *dev_id)
+{
+ unsigned int irq;
+ int retval;
+
+ irq = bind_evtchn_to_irq(evtchn);
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
+
+int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+ irqreturn_t (*handler)(int, void *),
+ unsigned long irqflags, const char *devname, void *dev_id)
+{
+ unsigned int irq;
+ int retval;
+
+ irq = bind_virq_to_irq(virq, cpu);
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
+
+int bind_ipi_to_irqhandler(enum ipi_vector ipi,
+ unsigned int cpu,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id)
+{
+ int irq, retval;
+
+ irq = bind_ipi_to_irq(ipi, cpu);
+ if (irq < 0)
+ return irq;
+
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+
+void unbind_from_irqhandler(unsigned int irq, void *dev_id)
+{
+ free_irq(irq, dev_id);
+ unbind_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
+
+void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
+{
+ int irq = per_cpu(ipi_to_irq, cpu)[vector];
+ BUG_ON(irq < 0);
+ notify_remote_via_irq(irq);
+}
+
+
+/*
+ * Search the CPU's pending event bitmasks.  For each event found, map
+ * the event number to an irq, and feed it into do_IRQ() for
+ * handling.
+ *
+ * Xen uses a two-level bitmap to speed searching. The first level is
+ * a bitset of words which contain pending event bits. The second
+ * level is a bitset of pending events themselves.
+ */
+fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
+{
+ int cpu = get_cpu();
+ struct shared_info *s = HYPERVISOR_shared_info;
+ struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
+ unsigned long pending_words;
+
+ vcpu_info->evtchn_upcall_pending = 0;
+
+ /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
+ pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
+ while (pending_words != 0) {
+ unsigned long pending_bits;
+ int word_idx = __ffs(pending_words);
+ pending_words &= ~(1UL << word_idx);
+
+ while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
+ int bit_idx = __ffs(pending_bits);
+ int port = (word_idx * BITS_PER_LONG) + bit_idx;
+ int irq = evtchn_to_irq[port];
+
+ if (irq != -1) {
+ regs->orig_eax = ~irq;
+ do_IRQ(regs);
+ }
+ }
+ }
+
+ put_cpu();
+}
+
+/* Rebind an evtchn so that it gets delivered to a specific cpu */
+static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
+{
+ struct evtchn_bind_vcpu bind_vcpu;
+ int evtchn = evtchn_from_irq(irq);
+
+ if (!VALID_EVTCHN(evtchn))
+ return;
+
+	/* Send future instances of this interrupt to the target vcpu. */
+ bind_vcpu.port = evtchn;
+ bind_vcpu.vcpu = tcpu;
+
+ /*
+ * If this fails, it usually just indicates that we're dealing with a
+ * virq or IPI channel, which don't actually need to be rebound. Ignore
+ * it, but don't do the xenlinux-level rebind in that case.
+ */
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
+ bind_evtchn_to_cpu(evtchn, tcpu);
+}
+
+
+static void set_affinity_irq(unsigned irq, cpumask_t dest)
+{
+ unsigned tcpu = first_cpu(dest);
+ rebind_irq_to_cpu(irq, tcpu);
+}
+
+static void enable_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ unmask_evtchn(evtchn);
+}
+
+static void disable_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ mask_evtchn(evtchn);
+}
+
+static void ack_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ move_native_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ clear_evtchn(evtchn);
+}
+
+static int retrigger_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+ int ret = 0;
+
+ if (VALID_EVTCHN(evtchn)) {
+ set_evtchn(evtchn);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+static struct irq_chip xen_dynamic_chip __read_mostly = {
+ .name = "xen-dyn",
+ .mask = disable_dynirq,
+ .unmask = enable_dynirq,
+ .ack = ack_dynirq,
+ .set_affinity = set_affinity_irq,
+ .retrigger = retrigger_dynirq,
+};
+
+void __init xen_init_IRQ(void)
+{
+ int i;
+
+ init_evtchn_cpu_bindings();
+
+ /* No event channels are 'live' right now. */
+ for (i = 0; i < NR_EVENT_CHANNELS; i++)
+ mask_evtchn(i);
+
+ /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
+ for (i = 0; i < NR_IRQS; i++)
+ irq_bindcount[i] = 0;
+
+ irq_ctx_init(smp_processor_id());
+}
diff --git a/arch/i386/xen/features.c b/arch/i386/xen/features.c
new file mode 100644
index 00000000000..0707714e40d
--- /dev/null
+++ b/arch/i386/xen/features.c
@@ -0,0 +1,29 @@
+/******************************************************************************
+ * features.c
+ *
+ * Xen feature flags.
+ *
+ * Copyright (c) 2006, Ian Campbell, XenSource Inc.
+ */
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/features.h>
+
+u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
+EXPORT_SYMBOL_GPL(xen_features);
+
+void xen_setup_features(void)
+{
+ struct xen_feature_info fi;
+ int i, j;
+
+ for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
+ fi.submap_idx = i;
+ if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
+ break;
+ for (j = 0; j < 32; j++)
+ xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
+ }
+}
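+
+/*
+ * After xen_setup_features() has run, individual flags can be tested
+ * with the xen_feature() helper declared next to these flags in
+ * <xen/features.h>, e.g. (the flag is just one example):
+ *
+ *	xen_setup_features();
+ *	if (xen_feature(XENFEAT_auto_translated_physmap))
+ *		...pfns and mfns are identical, skip translation...
+ */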
diff --git a/arch/i386/xen/manage.c b/arch/i386/xen/manage.c
new file mode 100644
index 00000000000..aa7af9e6abc
--- /dev/null
+++ b/arch/i386/xen/manage.c
@@ -0,0 +1,143 @@
+/*
+ * Handle external requests for shutdown, reboot and sysrq
+ */
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/reboot.h>
+#include <linux/sysrq.h>
+
+#include <xen/xenbus.h>
+
+#define SHUTDOWN_INVALID -1
+#define SHUTDOWN_POWEROFF 0
+#define SHUTDOWN_SUSPEND 2
+/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
+ * report a crash, not be instructed to crash!
+ * HALT is the same as POWEROFF, as far as we're concerned. The tools use
+ * the distinction when we return the reason code to them.
+ */
+#define SHUTDOWN_HALT 4
+
+/* Ignore multiple shutdown requests. */
+static int shutting_down = SHUTDOWN_INVALID;
+
+static void shutdown_handler(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ char *str;
+ struct xenbus_transaction xbt;
+ int err;
+
+ if (shutting_down != SHUTDOWN_INVALID)
+ return;
+
+ again:
+ err = xenbus_transaction_start(&xbt);
+ if (err)
+ return;
+
+ str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
+ /* Ignore read errors and empty reads. */
+ if (XENBUS_IS_ERR_READ(str)) {
+ xenbus_transaction_end(xbt, 1);
+ return;
+ }
+
+ xenbus_write(xbt, "control", "shutdown", "");
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err == -EAGAIN) {
+ kfree(str);
+ goto again;
+ }
+
+ if (strcmp(str, "poweroff") == 0 ||
+ strcmp(str, "halt") == 0)
+ orderly_poweroff(false);
+ else if (strcmp(str, "reboot") == 0)
+ ctrl_alt_del();
+ else {
+ printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
+ shutting_down = SHUTDOWN_INVALID;
+ }
+
+ kfree(str);
+}
+
+static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
+ unsigned int len)
+{
+ char sysrq_key = '\0';
+ struct xenbus_transaction xbt;
+ int err;
+
+ again:
+ err = xenbus_transaction_start(&xbt);
+ if (err)
+ return;
+ if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
+ printk(KERN_ERR "Unable to read sysrq code in "
+ "control/sysrq\n");
+ xenbus_transaction_end(xbt, 1);
+ return;
+ }
+
+ if (sysrq_key != '\0')
+ xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err == -EAGAIN)
+ goto again;
+
+ if (sysrq_key != '\0')
+ handle_sysrq(sysrq_key, NULL);
+}
+
+static struct xenbus_watch shutdown_watch = {
+ .node = "control/shutdown",
+ .callback = shutdown_handler
+};
+
+static struct xenbus_watch sysrq_watch = {
+ .node = "control/sysrq",
+ .callback = sysrq_handler
+};
+
+static int setup_shutdown_watcher(void)
+{
+ int err;
+
+ err = register_xenbus_watch(&shutdown_watch);
+ if (err) {
+ printk(KERN_ERR "Failed to set shutdown watcher\n");
+ return err;
+ }
+
+ err = register_xenbus_watch(&sysrq_watch);
+ if (err) {
+ printk(KERN_ERR "Failed to set sysrq watcher\n");
+ return err;
+ }
+
+ return 0;
+}
+
+static int shutdown_event(struct notifier_block *notifier,
+ unsigned long event,
+ void *data)
+{
+ setup_shutdown_watcher();
+ return NOTIFY_DONE;
+}
+
+static int __init setup_shutdown_event(void)
+{
+ static struct notifier_block xenstore_notifier = {
+ .notifier_call = shutdown_event
+ };
+ register_xenstore_notifier(&xenstore_notifier);
+
+ return 0;
+}
+
+subsys_initcall(setup_shutdown_event);
diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c
new file mode 100644
index 00000000000..4ae038aa6c2
--- /dev/null
+++ b/arch/i386/xen/mmu.c
@@ -0,0 +1,564 @@
+/*
+ * Xen mmu operations
+ *
+ * This file contains the various mmu fetch and update operations.
+ * The most important job they must perform is the mapping between the
+ * domain's pfn and the overall machine mfns.
+ *
+ * Xen allows guests to directly update the pagetable, in a controlled
+ * fashion. In other words, the guest modifies the same pagetable
+ * that the CPU actually uses, which eliminates the overhead of having
+ * a separate shadow pagetable.
+ *
+ * In order to allow this, it falls on the guest domain to map its
+ * notion of a "physical" pfn - which is just a domain-local linear
+ * address - into a real "machine address" which the CPU's MMU can
+ * use.
+ *
+ * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
+ * inserted directly into the pagetable. When creating a new
+ * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
+ * when reading the content back with __(pgd|pmd|pte)_val, it converts
+ * the mfn back into a pfn.
+ *
+ * The other constraint is that all pages which make up a pagetable
+ * must be mapped read-only in the guest. This prevents uncontrolled
+ * guest updates to the pagetable. Xen strictly enforces this, and
+ * will disallow any pagetable update which will end up mapping a
+ * pagetable page RW, and will disallow using any writable page as a
+ * pagetable.
+ *
+ * Naively, when loading %cr3 with the base of a new pagetable, Xen
+ * would need to validate the whole pagetable before going on.
+ * Naturally, this is quite slow. The solution is to "pin" a
+ * pagetable, which enforces all the constraints on the pagetable even
+ * when it is not actively in use. This means that Xen can be assured
+ * that it is still valid when you do load it into %cr3, and doesn't
+ * need to revalidate it.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+#include <linux/sched.h>
+#include <linux/highmem.h>
+#include <linux/bug.h>
+#include <linux/sched.h>
+
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/paravirt.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include <xen/page.h>
+#include <xen/interface/xen.h>
+
+#include "multicalls.h"
+#include "mmu.h"
+
+xmaddr_t arbitrary_virt_to_machine(unsigned long address)
+{
+ pte_t *pte = lookup_address(address);
+ unsigned offset = address & ~PAGE_MASK; /* offset within the page */
+
+ BUG_ON(pte == NULL);
+
+ return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
+}
+
+void make_lowmem_page_readonly(void *vaddr)
+{
+ pte_t *pte, ptev;
+ unsigned long address = (unsigned long)vaddr;
+
+ pte = lookup_address(address);
+ BUG_ON(pte == NULL);
+
+ ptev = pte_wrprotect(*pte);
+
+ if (HYPERVISOR_update_va_mapping(address, ptev, 0))
+ BUG();
+}
+
+void make_lowmem_page_readwrite(void *vaddr)
+{
+ pte_t *pte, ptev;
+ unsigned long address = (unsigned long)vaddr;
+
+ pte = lookup_address(address);
+ BUG_ON(pte == NULL);
+
+ ptev = pte_mkwrite(*pte);
+
+ if (HYPERVISOR_update_va_mapping(address, ptev, 0))
+ BUG();
+}
+
+
+void xen_set_pmd(pmd_t *ptr, pmd_t val)
+{
+ struct multicall_space mcs;
+ struct mmu_update *u;
+
+ preempt_disable();
+
+ mcs = xen_mc_entry(sizeof(*u));
+ u = mcs.args;
+ u->ptr = virt_to_machine(ptr).maddr;
+ u->val = pmd_val_ma(val);
+ MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+
+/*
+ * Associate a virtual page frame with a given physical page frame
+ * and protection flags for that frame.
+ */
+void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = swapper_pg_dir + pgd_index(vaddr);
+ if (pgd_none(*pgd)) {
+ BUG();
+ return;
+ }
+ pud = pud_offset(pgd, vaddr);
+ if (pud_none(*pud)) {
+ BUG();
+ return;
+ }
+ pmd = pmd_offset(pud, vaddr);
+ if (pmd_none(*pmd)) {
+ BUG();
+ return;
+ }
+ pte = pte_offset_kernel(pmd, vaddr);
+ /* <mfn,flags> stored as-is, to permit clearing entries */
+ xen_set_pte(pte, mfn_pte(mfn, flags));
+
+ /*
+ * It's enough to flush this one mapping.
+ * (PGE mappings get flushed as well)
+ */
+ __flush_tlb_one(vaddr);
+}
+
+void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval)
+{
+ if (mm == current->mm || mm == &init_mm) {
+ if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
+ struct multicall_space mcs;
+ mcs = xen_mc_entry(0);
+
+ MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+ return;
+ } else
+ if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
+ return;
+ }
+ xen_set_pte(ptep, pteval);
+}
+
+#ifdef CONFIG_X86_PAE
+void xen_set_pud(pud_t *ptr, pud_t val)
+{
+ struct multicall_space mcs;
+ struct mmu_update *u;
+
+ preempt_disable();
+
+ mcs = xen_mc_entry(sizeof(*u));
+ u = mcs.args;
+ u->ptr = virt_to_machine(ptr).maddr;
+ u->val = pud_val_ma(val);
+ MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+
+void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+ ptep->pte_high = pte.pte_high;
+ smp_wmb();
+ ptep->pte_low = pte.pte_low;
+}
+
+void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ set_64bit((u64 *)ptep, pte_val_ma(pte));
+}
+
+void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ ptep->pte_low = 0;
+ smp_wmb(); /* make sure low gets written first */
+ ptep->pte_high = 0;
+}
+
+void xen_pmd_clear(pmd_t *pmdp)
+{
+ xen_set_pmd(pmdp, __pmd(0));
+}
+
+unsigned long long xen_pte_val(pte_t pte)
+{
+ unsigned long long ret = 0;
+
+ if (pte.pte_low) {
+ ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
+ ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+ }
+
+ return ret;
+}
+
+unsigned long long xen_pmd_val(pmd_t pmd)
+{
+ unsigned long long ret = pmd.pmd;
+ if (ret)
+ ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+ return ret;
+}
+
+unsigned long long xen_pgd_val(pgd_t pgd)
+{
+ unsigned long long ret = pgd.pgd;
+ if (ret)
+ ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+ return ret;
+}
+
+pte_t xen_make_pte(unsigned long long pte)
+{
+ if (pte & 1)
+ pte = phys_to_machine(XPADDR(pte)).maddr;
+
+ return (pte_t){ pte, pte >> 32 };
+}
+
+pmd_t xen_make_pmd(unsigned long long pmd)
+{
+ if (pmd & 1)
+ pmd = phys_to_machine(XPADDR(pmd)).maddr;
+
+ return (pmd_t){ pmd };
+}
+
+pgd_t xen_make_pgd(unsigned long long pgd)
+{
+ if (pgd & _PAGE_PRESENT)
+ pgd = phys_to_machine(XPADDR(pgd)).maddr;
+
+ return (pgd_t){ pgd };
+}
+#else /* !PAE */
+void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+ *ptep = pte;
+}
+
+unsigned long xen_pte_val(pte_t pte)
+{
+ unsigned long ret = pte.pte_low;
+
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr;
+
+ return ret;
+}
+
+unsigned long xen_pgd_val(pgd_t pgd)
+{
+ unsigned long ret = pgd.pgd;
+ if (ret)
+ ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+ return ret;
+}
+
+pte_t xen_make_pte(unsigned long pte)
+{
+ if (pte & _PAGE_PRESENT)
+ pte = phys_to_machine(XPADDR(pte)).maddr;
+
+ return (pte_t){ pte };
+}
+
+pgd_t xen_make_pgd(unsigned long pgd)
+{
+ if (pgd & _PAGE_PRESENT)
+ pgd = phys_to_machine(XPADDR(pgd)).maddr;
+
+ return (pgd_t){ pgd };
+}
+#endif /* CONFIG_X86_PAE */
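+
+/*
+ * The net effect of the conversion helpers above is that the address
+ * part of a pte round-trips between the kernel's pfn-based view and
+ * Xen's mfn-based view.  A rough sketch of the intent (not a literal
+ * call path):
+ *
+ *	pte_t pte = pfn_pte(pfn, PAGE_KERNEL);
+ *		xen_make_pte() stored the machine address,
+ *		phys_to_machine(XPADDR(pfn << PAGE_SHIFT)).maddr
+ *	unsigned long pfn_back = pte_pfn(pte);
+ *		xen_pte_val() translated the mfn back, so pfn_back == pfn
+ *
+ * Generic mm code therefore keeps thinking purely in pfns.
+ */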
+
+
+
+/*
+ (Yet another) pagetable walker. This one is intended for pinning a
+ pagetable. This means that it walks a pagetable and calls the
+ callback function on each page it finds making up the page table,
+ at every level. It walks the entire pagetable, but it only bothers
+ pinning pte pages which are below limit. In the normal case
+ this will be TASK_SIZE, but at boot we need to pin up to
+ FIXADDR_TOP. But the important bit is that we don't pin beyond
+ there, because then we start getting into Xen's ptes.
+*/
+static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
+ unsigned long limit)
+{
+ pgd_t *pgd = pgd_base;
+ int flush = 0;
+ unsigned long addr = 0;
+ unsigned long pgd_next;
+
+ BUG_ON(limit > FIXADDR_TOP);
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+ for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
+ pud_t *pud;
+ unsigned long pud_limit, pud_next;
+
+ pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
+
+ if (!pgd_val(*pgd))
+ continue;
+
+ pud = pud_offset(pgd, 0);
+
+ if (PTRS_PER_PUD > 1) /* not folded */
+ flush |= (*func)(virt_to_page(pud), 0);
+
+ for (; addr != pud_limit; pud++, addr = pud_next) {
+ pmd_t *pmd;
+ unsigned long pmd_limit;
+
+ pud_next = pud_addr_end(addr, pud_limit);
+
+ if (pud_next < limit)
+ pmd_limit = pud_next;
+ else
+ pmd_limit = limit;
+
+ if (pud_none(*pud))
+ continue;
+
+ pmd = pmd_offset(pud, 0);
+
+ if (PTRS_PER_PMD > 1) /* not folded */
+ flush |= (*func)(virt_to_page(pmd), 0);
+
+ for (; addr != pmd_limit; pmd++) {
+ addr += (PAGE_SIZE * PTRS_PER_PTE);
+ if ((pmd_limit-1) < (addr-1)) {
+ addr = pmd_limit;
+ break;
+ }
+
+ if (pmd_none(*pmd))
+ continue;
+
+ flush |= (*func)(pmd_page(*pmd), 0);
+ }
+ }
+ }
+
+ flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
+
+ return flush;
+}
+
+static int pin_page(struct page *page, unsigned flags)
+{
+ unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
+ int flush;
+
+ if (pgfl)
+ flush = 0; /* already pinned */
+ else if (PageHighMem(page))
+ /* kmaps need flushing if we found an unpinned
+ highpage */
+ flush = 1;
+ else {
+ void *pt = lowmem_page_address(page);
+ unsigned long pfn = page_to_pfn(page);
+ struct multicall_space mcs = __xen_mc_entry(0);
+
+ flush = 0;
+
+ MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
+ pfn_pte(pfn, PAGE_KERNEL_RO),
+ flags);
+ }
+
+ return flush;
+}
+
+/* This is called just after a mm has been created, but it has not
+ been used yet. We need to make sure that its pagetable is all
+ read-only, and can be pinned. */
+void xen_pgd_pin(pgd_t *pgd)
+{
+ struct multicall_space mcs;
+ struct mmuext_op *op;
+
+ xen_mc_batch();
+
+ if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
+ /* re-enable interrupts for kmap_flush_unused */
+ xen_mc_issue(0);
+ kmap_flush_unused();
+ xen_mc_batch();
+ }
+
+ mcs = __xen_mc_entry(sizeof(*op));
+ op = mcs.args;
+
+#ifdef CONFIG_X86_PAE
+ op->cmd = MMUEXT_PIN_L3_TABLE;
+#else
+ op->cmd = MMUEXT_PIN_L2_TABLE;
+#endif
+ op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(0);
+}
+
+/* The init_mm pagetable is really pinned as soon as it's created, but
+ that's before we have page structures to store the bits. So do all
+ the book-keeping now. */
+static __init int mark_pinned(struct page *page, unsigned flags)
+{
+ SetPagePinned(page);
+ return 0;
+}
+
+void __init xen_mark_init_mm_pinned(void)
+{
+ pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
+}
+
+static int unpin_page(struct page *page, unsigned flags)
+{
+ unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
+
+ if (pgfl && !PageHighMem(page)) {
+ void *pt = lowmem_page_address(page);
+ unsigned long pfn = page_to_pfn(page);
+ struct multicall_space mcs = __xen_mc_entry(0);
+
+ MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
+ pfn_pte(pfn, PAGE_KERNEL),
+ flags);
+ }
+
+ return 0; /* never need to flush on unpin */
+}
+
+/* Release a pagetable's pages back as normal RW */
+static void xen_pgd_unpin(pgd_t *pgd)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs;
+
+ xen_mc_batch();
+
+ mcs = __xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_UNPIN_TABLE;
+ op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ pgd_walk(pgd, unpin_page, TASK_SIZE);
+
+ xen_mc_issue(0);
+}
+
+void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
+{
+ spin_lock(&next->page_table_lock);
+ xen_pgd_pin(next->pgd);
+ spin_unlock(&next->page_table_lock);
+}
+
+void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+ spin_lock(&mm->page_table_lock);
+ xen_pgd_pin(mm->pgd);
+ spin_unlock(&mm->page_table_lock);
+}
+
+
+#ifdef CONFIG_SMP
+/* Another cpu may still have its %cr3 pointing at the pagetable, so
+ we need to repoint it somewhere else before we can unpin it. */
+static void drop_other_mm_ref(void *info)
+{
+ struct mm_struct *mm = info;
+
+ if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
+ leave_mm(smp_processor_id());
+}
+
+static void drop_mm_ref(struct mm_struct *mm)
+{
+ if (current->active_mm == mm) {
+ if (current->mm == mm)
+ load_cr3(swapper_pg_dir);
+ else
+ leave_mm(smp_processor_id());
+ }
+
+ if (!cpus_empty(mm->cpu_vm_mask))
+ xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
+ mm, 1);
+}
+#else
+static void drop_mm_ref(struct mm_struct *mm)
+{
+ if (current->active_mm == mm)
+ load_cr3(swapper_pg_dir);
+}
+#endif
+
+/*
+ * While a process runs, Xen pins its pagetables, which means that the
+ * hypervisor forces it to be read-only, and it controls all updates
+ * to it. This means that all pagetable updates have to go via the
+ * hypervisor, which is moderately expensive.
+ *
+ * Since we're pulling the pagetable down, we switch over to init_mm,
+ * unpin the old process pagetable and mark it all read-write, which
+ * allows further operations on it to be simple memory accesses.
+ *
+ * The only subtle point is that another CPU may still be using the
+ * pagetable because of lazy tlb flushing. This means we need to
+ * switch all CPUs off this pagetable before we can unpin it.
+ */
+void xen_exit_mmap(struct mm_struct *mm)
+{
+ get_cpu(); /* make sure we don't move around */
+ drop_mm_ref(mm);
+ put_cpu();
+
+ spin_lock(&mm->page_table_lock);
+ xen_pgd_unpin(mm->pgd);
+ spin_unlock(&mm->page_table_lock);
+}
diff --git a/arch/i386/xen/mmu.h b/arch/i386/xen/mmu.h
new file mode 100644
index 00000000000..c9ff27f3ac3
--- /dev/null
+++ b/arch/i386/xen/mmu.h
@@ -0,0 +1,60 @@
+#ifndef _XEN_MMU_H
+#define _XEN_MMU_H
+
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+/*
+ * Page-directory addresses above 4GB do not fit into architectural %cr3.
+ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
+ * must use the following accessor macros to pack/unpack valid MFNs.
+ *
+ * Note that Xen is using the fact that the pagetable base is always
+ * page-aligned, and putting the 12 MSB of the address into the 12 LSB
+ * of cr3.
+ */
+#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
+#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
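+
+/*
+ * Worked example: both macros are 32-bit rotations by 12 bits, so they
+ * are exact inverses: xen_cr3_to_pfn(xen_pfn_to_cr3(pfn)) == pfn.  The
+ * upper bits of an mfn whose address exceeds 4GB land in the low 12
+ * bits of the packed value, which would otherwise be zero because the
+ * pagetable base is page-aligned.
+ */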
+
+
+void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags);
+
+void xen_set_pte(pte_t *ptep, pte_t pteval);
+void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval);
+void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
+
+void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
+void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
+void xen_exit_mmap(struct mm_struct *mm);
+
+void xen_pgd_pin(pgd_t *pgd);
+//void xen_pgd_unpin(pgd_t *pgd);
+
+#ifdef CONFIG_X86_PAE
+unsigned long long xen_pte_val(pte_t);
+unsigned long long xen_pmd_val(pmd_t);
+unsigned long long xen_pgd_val(pgd_t);
+
+pte_t xen_make_pte(unsigned long long);
+pmd_t xen_make_pmd(unsigned long long);
+pgd_t xen_make_pgd(unsigned long long);
+
+void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval);
+void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
+void xen_set_pud(pud_t *ptr, pud_t val);
+void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void xen_pmd_clear(pmd_t *pmdp);
+
+
+#else
+unsigned long xen_pte_val(pte_t);
+unsigned long xen_pmd_val(pmd_t);
+unsigned long xen_pgd_val(pgd_t);
+
+pte_t xen_make_pte(unsigned long);
+pmd_t xen_make_pmd(unsigned long);
+pgd_t xen_make_pgd(unsigned long);
+#endif
+
+#endif /* _XEN_MMU_H */
diff --git a/arch/i386/xen/multicalls.c b/arch/i386/xen/multicalls.c
new file mode 100644
index 00000000000..c837e8e463d
--- /dev/null
+++ b/arch/i386/xen/multicalls.c
@@ -0,0 +1,90 @@
+/*
+ * Xen hypercall batching.
+ *
+ * Xen allows multiple hypercalls to be issued at once, using the
+ * multicall interface. This allows the cost of trapping into the
+ * hypervisor to be amortized over several calls.
+ *
+ * This file implements a simple interface for multicalls. There's a
+ * per-cpu buffer of outstanding multicalls. When you want to queue a
+ * multicall for issuing, you can allocate a multicall slot for the
+ * call and its arguments, along with scratch space that the arguments
+ * can point to (for passing pointers to structures,
+ * etc). When the multicall is actually issued, all the space for the
+ * commands and allocated memory is freed for reuse.
+ *
+ * Multicalls are flushed whenever any of the buffers get full, or
+ * when explicitly requested. There's no way to get per-multicall
+ * return results back. It will BUG if any of the multicalls fail.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+
+#include <asm/xen/hypercall.h>
+
+#include "multicalls.h"
+
+#define MC_BATCH 32
+#define MC_ARGS (MC_BATCH * 16 / sizeof(u64))
+
+struct mc_buffer {
+ struct multicall_entry entries[MC_BATCH];
+ u64 args[MC_ARGS];
+ unsigned mcidx, argidx;
+};
+
+static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
+DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
+
+void xen_mc_flush(void)
+{
+ struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+ int ret = 0;
+ unsigned long flags;
+
+ BUG_ON(preemptible());
+
+ /* Disable interrupts in case someone comes in and queues
+ something in the middle */
+ local_irq_save(flags);
+
+ if (b->mcidx) {
+ int i;
+
+ if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
+ BUG();
+ for (i = 0; i < b->mcidx; i++)
+ if (b->entries[i].result < 0)
+ ret++;
+ b->mcidx = 0;
+ b->argidx = 0;
+ } else
+ BUG_ON(b->argidx != 0);
+
+ local_irq_restore(flags);
+
+ BUG_ON(ret);
+}
+
+struct multicall_space __xen_mc_entry(size_t args)
+{
+ struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+ struct multicall_space ret;
+ unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
+
+ BUG_ON(preemptible());
+ BUG_ON(argspace > MC_ARGS);
+
+ if (b->mcidx == MC_BATCH ||
+ (b->argidx + argspace) > MC_ARGS)
+ xen_mc_flush();
+
+ ret.mc = &b->entries[b->mcidx];
+ b->mcidx++;
+ ret.args = &b->args[b->argidx];
+ b->argidx += argspace;
+
+ return ret;
+}
diff --git a/arch/i386/xen/multicalls.h b/arch/i386/xen/multicalls.h
new file mode 100644
index 00000000000..e6f7530b156
--- /dev/null
+++ b/arch/i386/xen/multicalls.h
@@ -0,0 +1,45 @@
+#ifndef _XEN_MULTICALLS_H
+#define _XEN_MULTICALLS_H
+
+#include "xen-ops.h"
+
+/* Multicalls */
+struct multicall_space
+{
+ struct multicall_entry *mc;
+ void *args;
+};
+
+/* Allocate room for a multicall and its args */
+struct multicall_space __xen_mc_entry(size_t args);
+
+DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
+
+/* Call to start a batch of multiple __xen_mc_entry()s. Must be
+ paired with xen_mc_issue() */
+static inline void xen_mc_batch(void)
+{
+ /* need to disable interrupts until this entry is complete */
+ local_irq_save(__get_cpu_var(xen_mc_irq_flags));
+}
+
+static inline struct multicall_space xen_mc_entry(size_t args)
+{
+ xen_mc_batch();
+ return __xen_mc_entry(args);
+}
+
+/* Flush all pending multicalls */
+void xen_mc_flush(void);
+
+/* Issue a multicall if we're not in a lazy mode */
+static inline void xen_mc_issue(unsigned mode)
+{
+ if ((xen_get_lazy_mode() & mode) == 0)
+ xen_mc_flush();
+
+ /* restore flags saved in xen_mc_batch */
+ local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
+}
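+
+/*
+ * Typical usage, mirroring xen_set_pmd() in mmu.c: reserve an entry
+ * (plus argument space), fill it in, then issue.  The flush is
+ * deferred if we are inside the named lazy mode:
+ *
+ *	struct multicall_space mcs;
+ *	struct mmu_update *u;
+ *
+ *	preempt_disable();
+ *	mcs = xen_mc_entry(sizeof(*u));
+ *	u = mcs.args;
+ *	u->ptr = virt_to_machine(ptr).maddr;
+ *	u->val = pmd_val_ma(val);
+ *	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
+ *	xen_mc_issue(PARAVIRT_LAZY_MMU);
+ *	preempt_enable();
+ */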
+
+#endif /* _XEN_MULTICALLS_H */
diff --git a/arch/i386/xen/setup.c b/arch/i386/xen/setup.c
new file mode 100644
index 00000000000..2fe6eac510f
--- /dev/null
+++ b/arch/i386/xen/setup.c
@@ -0,0 +1,96 @@
+/*
+ * Machine specific setup for xen
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/pm.h>
+
+#include <asm/elf.h>
+#include <asm/e820.h>
+#include <asm/setup.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/interface/physdev.h>
+#include <xen/features.h>
+
+#include "xen-ops.h"
+
+/* These are code, but not functions. Defined in entry.S */
+extern const char xen_hypervisor_callback[];
+extern const char xen_failsafe_callback[];
+
+unsigned long *phys_to_machine_mapping;
+EXPORT_SYMBOL(phys_to_machine_mapping);
+
+/**
+ * xen_memory_setup - Hook for machine specific memory setup.
+ **/
+
+char * __init xen_memory_setup(void)
+{
+ unsigned long max_pfn = xen_start_info->nr_pages;
+
+ e820.nr_map = 0;
+ add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);
+
+ return "Xen";
+}
+
+static void xen_idle(void)
+{
+ local_irq_disable();
+
+ if (need_resched())
+ local_irq_enable();
+ else {
+ current_thread_info()->status &= ~TS_POLLING;
+ smp_mb__after_clear_bit();
+ safe_halt();
+ current_thread_info()->status |= TS_POLLING;
+ }
+}
+
+void __init xen_arch_setup(void)
+{
+ struct physdev_set_iopl set_iopl;
+ int rc;
+
+ HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
+ HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
+
+ HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
+ __KERNEL_CS, (unsigned long)xen_failsafe_callback);
+
+ set_iopl.iopl = 1;
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+ if (rc != 0)
+ printk(KERN_INFO "physdev_op failed %d\n", rc);
+
+#ifdef CONFIG_ACPI
+ if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
+ printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
+ disable_acpi();
+ }
+#endif
+
+ memcpy(boot_command_line, xen_start_info->cmd_line,
+ MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
+ COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
+
+ pm_idle = xen_idle;
+
+#ifdef CONFIG_SMP
+ /* fill cpus_possible with all available cpus */
+ xen_fill_possible_map();
+#endif
+
+ paravirt_disable_iospace();
+}
diff --git a/arch/i386/xen/smp.c b/arch/i386/xen/smp.c
new file mode 100644
index 00000000000..557b8e24706
--- /dev/null
+++ b/arch/i386/xen/smp.c
@@ -0,0 +1,404 @@
+/*
+ * Xen SMP support
+ *
+ * This file implements the Xen versions of smp_ops. SMP under Xen is
+ * very straightforward. Bringing a CPU up is simply a matter of
+ * loading its initial context and setting it running.
+ *
+ * IPIs are handled through the Xen event mechanism.
+ *
+ * Because virtual CPUs can be scheduled onto any real CPU, there's no
+ * useful topology information for the kernel to make use of. As a
+ * result, all CPUs are treated as if they're single-core and
+ * single-threaded.
+ *
+ * This does not handle HOTPLUG_CPU yet.
+ */
+#include <linux/sched.h>
+#include <linux/err.h>
+#include <linux/smp.h>
+
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/cpu.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+
+#include <asm/xen/interface.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/page.h>
+#include <xen/events.h>
+
+#include "xen-ops.h"
+#include "mmu.h"
+
+static cpumask_t cpu_initialized_map;
+static DEFINE_PER_CPU(int, resched_irq);
+static DEFINE_PER_CPU(int, callfunc_irq);
+
+/*
+ * Structure and data for smp_call_function(). This is designed to minimise
+ * static memory requirements. It also looks cleaner.
+ */
+static DEFINE_SPINLOCK(call_lock);
+
+struct call_data_struct {
+ void (*func) (void *info);
+ void *info;
+ atomic_t started;
+ atomic_t finished;
+ int wait;
+};
+
+static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
+
+static struct call_data_struct *call_data;
+
+/*
+ * Reschedule call back. Nothing to do,
+ * all the work is done automatically when
+ * we return from the interrupt.
+ */
+static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
+{
+ return IRQ_HANDLED;
+}
+
+static __cpuinit void cpu_bringup_and_idle(void)
+{
+ int cpu = smp_processor_id();
+
+ cpu_init();
+
+ preempt_disable();
+ per_cpu(cpu_state, cpu) = CPU_ONLINE;
+
+ xen_setup_cpu_clockevents();
+
+ /* We can take interrupts now: we're officially "up". */
+ local_irq_enable();
+
+ wmb(); /* make sure everything is out */
+ cpu_idle();
+}
+
+static int xen_smp_intr_init(unsigned int cpu)
+{
+ int rc;
+ const char *resched_name, *callfunc_name;
+
+ per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
+
+ resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
+ rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
+ cpu,
+ xen_reschedule_interrupt,
+ IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ resched_name,
+ NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(resched_irq, cpu) = rc;
+
+ callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
+ rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
+ cpu,
+ xen_call_function_interrupt,
+ IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ callfunc_name,
+ NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(callfunc_irq, cpu) = rc;
+
+ return 0;
+
+ fail:
+ if (per_cpu(resched_irq, cpu) >= 0)
+ unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
+ if (per_cpu(callfunc_irq, cpu) >= 0)
+ unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+ return rc;
+}
+
+void __init xen_fill_possible_map(void)
+{
+ int i, rc;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
+ if (rc >= 0)
+ cpu_set(i, cpu_possible_map);
+ }
+}
+
+void __init xen_smp_prepare_boot_cpu(void)
+{
+ int cpu;
+
+ BUG_ON(smp_processor_id() != 0);
+ native_smp_prepare_boot_cpu();
+
+ /* We've switched to the "real" per-cpu gdt, so make sure the
+ old memory can be recycled */
+ make_lowmem_page_readwrite(&per_cpu__gdt_page);
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ cpus_clear(cpu_sibling_map[cpu]);
+ cpus_clear(cpu_core_map[cpu]);
+ }
+
+ xen_setup_vcpu_info_placement();
+}
+
+void __init xen_smp_prepare_cpus(unsigned int max_cpus)
+{
+ unsigned cpu;
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ cpus_clear(cpu_sibling_map[cpu]);
+ cpus_clear(cpu_core_map[cpu]);
+ }
+
+ smp_store_cpu_info(0);
+ set_cpu_sibling_map(0);
+
+ if (xen_smp_intr_init(0))
+ BUG();
+
+ cpu_initialized_map = cpumask_of_cpu(0);
+
+ /* Restrict the possible_map according to max_cpus. */
+ while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
+ for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
+ continue;
+ cpu_clear(cpu, cpu_possible_map);
+ }
+
+ for_each_possible_cpu (cpu) {
+ struct task_struct *idle;
+
+ if (cpu == 0)
+ continue;
+
+ idle = fork_idle(cpu);
+ if (IS_ERR(idle))
+ panic("failed fork for CPU %d", cpu);
+
+ cpu_set(cpu, cpu_present_map);
+ }
+
+ //init_xenbus_allowed_cpumask();
+}
+
+static __cpuinit int
+cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
+{
+ struct vcpu_guest_context *ctxt;
+ struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
+
+ if (cpu_test_and_set(cpu, cpu_initialized_map))
+ return 0;
+
+ ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
+ if (ctxt == NULL)
+ return -ENOMEM;
+
+ ctxt->flags = VGCF_IN_KERNEL;
+ ctxt->user_regs.ds = __USER_DS;
+ ctxt->user_regs.es = __USER_DS;
+ ctxt->user_regs.fs = __KERNEL_PERCPU;
+ ctxt->user_regs.gs = 0;
+ ctxt->user_regs.ss = __KERNEL_DS;
+ ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
+ ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
+
+ memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
+
+ xen_copy_trap_info(ctxt->trap_ctxt);
+
+ ctxt->ldt_ents = 0;
+
+ BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
+ make_lowmem_page_readonly(gdt->gdt);
+
+ ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
+ ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt);
+
+ ctxt->user_regs.cs = __KERNEL_CS;
+ ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
+
+ ctxt->kernel_ss = __KERNEL_DS;
+ ctxt->kernel_sp = idle->thread.esp0;
+
+ ctxt->event_callback_cs = __KERNEL_CS;
+ ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
+ ctxt->failsafe_callback_cs = __KERNEL_CS;
+ ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
+
+ per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
+ ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
+
+ if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
+ BUG();
+
+ kfree(ctxt);
+ return 0;
+}
+
+int __cpuinit xen_cpu_up(unsigned int cpu)
+{
+ struct task_struct *idle = idle_task(cpu);
+ int rc;
+
+#if 0
+ rc = cpu_up_check(cpu);
+ if (rc)
+ return rc;
+#endif
+
+ init_gdt(cpu);
+ per_cpu(current_task, cpu) = idle;
+ irq_ctx_init(cpu);
+ xen_setup_timer(cpu);
+
+ /* make sure interrupts start blocked */
+ per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
+
+ rc = cpu_initialize_context(cpu, idle);
+ if (rc)
+ return rc;
+
+ if (num_online_cpus() == 1)
+ alternatives_smp_switch(1);
+
+ rc = xen_smp_intr_init(cpu);
+ if (rc)
+ return rc;
+
+ smp_store_cpu_info(cpu);
+ set_cpu_sibling_map(cpu);
+ /* This must be done before setting cpu_online_map */
+ wmb();
+
+ cpu_set(cpu, cpu_online_map);
+
+ rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
+ BUG_ON(rc);
+
+ return 0;
+}
+
+void xen_smp_cpus_done(unsigned int max_cpus)
+{
+}
+
+static void stop_self(void *v)
+{
+ int cpu = smp_processor_id();
+
+ /* make sure we're not pinning something down */
+ load_cr3(swapper_pg_dir);
+ /* should set up a minimal gdt */
+
+ HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
+ BUG();
+}
+
+void xen_smp_send_stop(void)
+{
+ smp_call_function(stop_self, NULL, 0, 0);
+}
+
+void xen_smp_send_reschedule(int cpu)
+{
+ xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
+}
+
+
+static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
+{
+ unsigned cpu;
+
+ cpus_and(mask, mask, cpu_online_map);
+
+ for_each_cpu_mask(cpu, mask)
+ xen_send_IPI_one(cpu, vector);
+}
+
+static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
+{
+ void (*func) (void *info) = call_data->func;
+ void *info = call_data->info;
+ int wait = call_data->wait;
+
+ /*
+ * Notify initiating CPU that I've grabbed the data and am
+ * about to execute the function
+ */
+ mb();
+ atomic_inc(&call_data->started);
+ /*
+ * At this point the info structure may be out of scope unless wait==1
+ */
+ irq_enter();
+ (*func)(info);
+ irq_exit();
+
+ if (wait) {
+ mb(); /* commit everything before setting finished */
+ atomic_inc(&call_data->finished);
+ }
+
+ return IRQ_HANDLED;
+}
+
+int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
+ void *info, int wait)
+{
+ struct call_data_struct data;
+ int cpus;
+
+ /* Holding any lock stops cpus from going down. */
+ spin_lock(&call_lock);
+
+ cpu_clear(smp_processor_id(), mask);
+
+ cpus = cpus_weight(mask);
+ if (!cpus) {
+ spin_unlock(&call_lock);
+ return 0;
+ }
+
+ /* Can deadlock when called with interrupts disabled */
+ WARN_ON(irqs_disabled());
+
+ data.func = func;
+ data.info = info;
+ atomic_set(&data.started, 0);
+ data.wait = wait;
+ if (wait)
+ atomic_set(&data.finished, 0);
+
+ call_data = &data;
+ mb(); /* write everything before IPI */
+
+ /* Send a message to other CPUs and wait for them to respond */
+ xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
+
+ /* Make sure other vcpus get a chance to run.
+ XXX too severe? Maybe we should check the other CPU's states? */
+ HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+
+ /* Wait for response */
+ while (atomic_read(&data.started) != cpus ||
+ (wait && atomic_read(&data.finished) != cpus))
+ cpu_relax();
+
+ spin_unlock(&call_lock);
+
+ return 0;
+}
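+
+/*
+ * Usage sketch: mmu.c uses this to kick other vcpus off a pagetable
+ * before unpinning it:
+ *
+ *	xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
+ *				   mm, 1);
+ *
+ * drop_other_mm_ref() then runs on every cpu in the mask in interrupt
+ * context, and the final argument makes the caller wait until all of
+ * them have finished.
+ */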
diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c
new file mode 100644
index 00000000000..51fdabf1fd4
--- /dev/null
+++ b/arch/i386/xen/time.c
@@ -0,0 +1,590 @@
+/*
+ * Xen time implementation.
+ *
+ * This is implemented in terms of a clocksource driver which uses
+ * the hypervisor clock as a nanosecond timebase, and a clockevent
+ * driver which uses the hypervisor's timer mechanism.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/kernel_stat.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+
+#include "xen-ops.h"
+
+#define XEN_SHIFT 22
+
+/* Xen may fire a timer up to this many ns early */
+#define TIMER_SLOP 100000
+#define NS_PER_TICK (1000000000LL / HZ)
+
+static cycle_t xen_clocksource_read(void);
+
+/* These are periodically updated in shared_info, and then copied here. */
+struct shadow_time_info {
+ u64 tsc_timestamp; /* TSC at last update of time vals. */
+ u64 system_timestamp; /* Time, in nanosecs, since boot. */
+ u32 tsc_to_nsec_mul;
+ int tsc_shift;
+ u32 version;
+};
+
+static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
+
+/* runstate info updated by Xen */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
+
+/* snapshots of runstate info */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
+
+/* unused ns of stolen and blocked time */
+static DEFINE_PER_CPU(u64, residual_stolen);
+static DEFINE_PER_CPU(u64, residual_blocked);
+
+/* return a consistent snapshot of a 64-bit time/counter value */
+static u64 get64(const u64 *p)
+{
+ u64 ret;
+
+ if (BITS_PER_LONG < 64) {
+ u32 *p32 = (u32 *)p;
+ u32 h, l;
+
+ /*
+ * Read high then low, and then make sure high is
+ * still the same; this will only loop if low wraps
+ * and carries into high.
+ * XXX some clean way to make this endian-proof?
+ */
+ do {
+ h = p32[1];
+ barrier();
+ l = p32[0];
+ barrier();
+ } while (p32[1] != h);
+
+ ret = (((u64)h) << 32) | l;
+ } else
+ ret = *p;
+
+ return ret;
+}
+
+/*
+ * Runstate accounting
+ */
+static void get_runstate_snapshot(struct vcpu_runstate_info *res)
+{
+ u64 state_time;
+ struct vcpu_runstate_info *state;
+
+ BUG_ON(preemptible());
+
+ state = &__get_cpu_var(runstate);
+
+ /*
+ * The runstate info is always updated by the hypervisor on
+ * the current CPU, so there's no need to use anything
+ * stronger than a compiler barrier when fetching it.
+ */
+ do {
+ state_time = get64(&state->state_entry_time);
+ barrier();
+ *res = *state;
+ barrier();
+ } while (get64(&state->state_entry_time) != state_time);
+}
+
+static void setup_runstate_info(int cpu)
+{
+ struct vcpu_register_runstate_memory_area area;
+
+ area.addr.v = &per_cpu(runstate, cpu);
+
+ if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
+ cpu, &area))
+ BUG();
+}
+
+static void do_stolen_accounting(void)
+{
+ struct vcpu_runstate_info state;
+ struct vcpu_runstate_info *snap;
+ s64 blocked, runnable, offline, stolen;
+ cputime_t ticks;
+
+ get_runstate_snapshot(&state);
+
+ WARN_ON(state.state != RUNSTATE_running);
+
+ snap = &__get_cpu_var(runstate_snapshot);
+
+ /* work out how much time the VCPU has not been runn*ing* */
+ blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
+ runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
+ offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
+
+ *snap = state;
+
+ /* Add the appropriate number of ticks of stolen time,
+ including any left-overs from last time. Passing NULL to
+ account_steal_time accounts the time as stolen. */
+ stolen = runnable + offline + __get_cpu_var(residual_stolen);
+
+ if (stolen < 0)
+ stolen = 0;
+
+ ticks = 0;
+ while (stolen >= NS_PER_TICK) {
+ ticks++;
+ stolen -= NS_PER_TICK;
+ }
+ __get_cpu_var(residual_stolen) = stolen;
+ account_steal_time(NULL, ticks);
+
+ /* Add the appropriate number of ticks of blocked time,
+ including any left-overs from last time. Passing idle to
+ account_steal_time accounts the time as idle/wait. */
+ blocked += __get_cpu_var(residual_blocked);
+
+ if (blocked < 0)
+ blocked = 0;
+
+ ticks = 0;
+ while (blocked >= NS_PER_TICK) {
+ ticks++;
+ blocked -= NS_PER_TICK;
+ }
+ __get_cpu_var(residual_blocked) = blocked;
+ account_steal_time(idle_task(smp_processor_id()), ticks);
+}
+
+/*
+ * Xen sched_clock implementation. Returns the number of unstolen
+ * nanoseconds, i.e. the nanoseconds the VCPU has spent in the
+ * RUNNING and BLOCKED states.
+ */
+unsigned long long xen_sched_clock(void)
+{
+ struct vcpu_runstate_info state;
+ cycle_t now;
+ u64 ret;
+ s64 offset;
+
+ /*
+ * Ideally sched_clock should be called on a per-cpu basis
+ * anyway, so preempt should already be disabled, but that's
+ * not current practice at the moment.
+ */
+ preempt_disable();
+
+ now = xen_clocksource_read();
+
+ get_runstate_snapshot(&state);
+
+ WARN_ON(state.state != RUNSTATE_running);
+
+ offset = now - state.state_entry_time;
+ if (offset < 0)
+ offset = 0;
+
+ ret = state.time[RUNSTATE_blocked] +
+ state.time[RUNSTATE_running] +
+ offset;
+
+ preempt_enable();
+
+ return ret;
+}
+
+
+/* Get the CPU speed from Xen */
+unsigned long xen_cpu_khz(void)
+{
+ u64 cpu_khz = 1000000ULL << 32;
+ const struct vcpu_time_info *info =
+ &HYPERVISOR_shared_info->vcpu_info[0].time;
+
+ do_div(cpu_khz, info->tsc_to_system_mul);
+ if (info->tsc_shift < 0)
+ cpu_khz <<= -info->tsc_shift;
+ else
+ cpu_khz >>= info->tsc_shift;
+
+ return cpu_khz;
+}
+
+/*
+ * Reads a consistent set of time-base values from Xen, into a shadow data
+ * area.
+ */
+static unsigned get_time_values_from_xen(void)
+{
+ struct vcpu_time_info *src;
+ struct shadow_time_info *dst;
+
+ /* src is shared memory with the hypervisor, so we need to
+ make sure we get a consistent snapshot, even in the face of
+ being preempted. */
+ src = &__get_cpu_var(xen_vcpu)->time;
+ dst = &__get_cpu_var(shadow_time);
+
+ do {
+ dst->version = src->version;
+ rmb(); /* fetch version before data */
+ dst->tsc_timestamp = src->tsc_timestamp;
+ dst->system_timestamp = src->system_time;
+ dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
+ dst->tsc_shift = src->tsc_shift;
+ rmb(); /* test version after fetching data */
+ } while ((src->version & 1) | (dst->version ^ src->version));
+
+ return dst->version;
+}
+
+/*
+ * Scale a 64-bit delta by shifting and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+ u64 product;
+#ifdef __i386__
+ u32 tmp1, tmp2;
+#endif
+
+ if (shift < 0)
+ delta >>= -shift;
+ else
+ delta <<= shift;
+
+#ifdef __i386__
+ __asm__ (
+ "mul %5 ; "
+ "mov %4,%%eax ; "
+ "mov %%edx,%4 ; "
+ "mul %5 ; "
+ "xor %5,%5 ; "
+ "add %4,%%eax ; "
+ "adc %5,%%edx ; "
+ : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+ : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif __x86_64__
+ __asm__ (
+ "mul %%rdx ; shrd $32,%%rdx,%%rax"
+ : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+ return product;
+}
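+
+/*
+ * Worked example (values are illustrative, not from a real platform):
+ * Xen publishes mul_frac as nanoseconds-per-cycle scaled by 2^32, with
+ * tsc_shift pre-adjusting the range.  A 2GHz TSC is 0.5ns per cycle,
+ * i.e. mul_frac = 0x80000000 with shift = 0, so a delta of 4000 cycles
+ * scales to (4000 * 0x80000000) >> 32 = 2000ns.
+ */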
+
+static u64 get_nsec_offset(struct shadow_time_info *shadow)
+{
+ u64 now, delta;
+ now = native_read_tsc();
+ delta = now - shadow->tsc_timestamp;
+ return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+}
+
+static cycle_t xen_clocksource_read(void)
+{
+ struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
+ cycle_t ret;
+ unsigned version;
+
+ do {
+ version = get_time_values_from_xen();
+ barrier();
+ ret = shadow->system_timestamp + get_nsec_offset(shadow);
+ barrier();
+ } while (version != __get_cpu_var(xen_vcpu)->time.version);
+
+ put_cpu_var(shadow_time);
+
+ return ret;
+}
+
+static void xen_read_wallclock(struct timespec *ts)
+{
+ const struct shared_info *s = HYPERVISOR_shared_info;
+ u32 version;
+ u64 delta;
+ struct timespec now;
+
+ /* get wallclock at system boot */
+ do {
+ version = s->wc_version;
+ rmb(); /* fetch version before time */
+ now.tv_sec = s->wc_sec;
+ now.tv_nsec = s->wc_nsec;
+ rmb(); /* fetch time before checking version */
+ } while ((s->wc_version & 1) | (version ^ s->wc_version));
+
+ delta = xen_clocksource_read(); /* time since system boot */
+ delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
+
+ now.tv_nsec = do_div(delta, NSEC_PER_SEC);
+ now.tv_sec = delta;
+
+ set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+}
+
+unsigned long xen_get_wallclock(void)
+{
+ struct timespec ts;
+
+ xen_read_wallclock(&ts);
+
+ return ts.tv_sec;
+}
+
+int xen_set_wallclock(unsigned long now)
+{
+ /* do nothing for domU */
+ return -1;
+}
+
+static struct clocksource xen_clocksource __read_mostly = {
+ .name = "xen",
+ .rating = 400,
+ .read = xen_clocksource_read,
+ .mask = ~0,
+ .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
+ .shift = XEN_SHIFT,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
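+
+/*
+ * Since xen_clocksource_read() already returns nanoseconds, the
+ * mult/shift pair above is an identity transform for the generic
+ * timekeeping code:
+ *
+ *	ns = (cycles * mult) >> shift = (cycles << 22) >> 22 = cycles
+ */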
+
+/*
+ Xen clockevent implementation
+
+ Xen has two clockevent implementations:
+
+ The old timer_op one works with all released versions of Xen prior
+ to version 3.0.4. This version of the hypervisor provides a
+ single-shot timer with nanosecond resolution. However, it shares its
+ event channel with a 100Hz tick which is delivered while the
+ vcpu is running. We don't care about or use this tick, but it will
+ cause the core time code to think the timer fired too soon, and
+ will end up resetting it each time. It could be filtered, but
+ doing so has complications when the ktime clocksource is not yet
+ the xen clocksource (ie, at boot time).
+
+ The new vcpu_op-based timer interface allows the tick timer period
+ to be changed or turned off. The tick timer is not useful as a
+ periodic timer because events are only delivered to running vcpus.
+ The one-shot timer can report when a timeout is in the past, so
+ set_next_event is capable of returning -ETIME when appropriate.
+ This interface is used when available.
+*/
+
+
+/*
+ Get a hypervisor absolute time. In theory we could maintain an
+ offset between the kernel's time and the hypervisor's time, and
+ apply that to a kernel's absolute timeout. Unfortunately the
+ hypervisor and kernel times can drift even if the kernel is using
+ the Xen clocksource, because ntp can warp the kernel's clocksource.
+*/
+static s64 get_abs_timeout(unsigned long delta)
+{
+ return xen_clocksource_read() + delta;
+}
+
+static void xen_timerop_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ switch (mode) {
+ case CLOCK_EVT_MODE_PERIODIC:
+ /* unsupported */
+ WARN_ON(1);
+ break;
+
+ case CLOCK_EVT_MODE_ONESHOT:
+ break;
+
+ case CLOCK_EVT_MODE_UNUSED:
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ HYPERVISOR_set_timer_op(0); /* cancel timeout */
+ break;
+ }
+}
+
+static int xen_timerop_set_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+
+ if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
+ BUG();
+
+ /* We may have missed the deadline, but there's no real way of
+ knowing for sure. If the event was in the past, then we'll
+ get an immediate interrupt. */
+
+ return 0;
+}
+
+static const struct clock_event_device xen_timerop_clockevent = {
+ .name = "xen",
+ .features = CLOCK_EVT_FEAT_ONESHOT,
+
+ .max_delta_ns = 0xffffffff,
+ .min_delta_ns = TIMER_SLOP,
+
+ .mult = 1,
+ .shift = 0,
+ .rating = 500,
+
+ .set_mode = xen_timerop_set_mode,
+ .set_next_event = xen_timerop_set_next_event,
+};
+
+
+
+static void xen_vcpuop_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ int cpu = smp_processor_id();
+
+ switch (mode) {
+ case CLOCK_EVT_MODE_PERIODIC:
+ WARN_ON(1); /* unsupported */
+ break;
+
+ case CLOCK_EVT_MODE_ONESHOT:
+ if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+ BUG();
+ break;
+
+ case CLOCK_EVT_MODE_UNUSED:
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
+ HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+ BUG();
+ break;
+ }
+}
+
+static int xen_vcpuop_set_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ int cpu = smp_processor_id();
+ struct vcpu_set_singleshot_timer single;
+ int ret;
+
+ WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+
+ single.timeout_abs_ns = get_abs_timeout(delta);
+ single.flags = VCPU_SSHOTTMR_future;
+
+ ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
+
+ BUG_ON(ret != 0 && ret != -ETIME);
+
+ return ret;
+}
+
+static const struct clock_event_device xen_vcpuop_clockevent = {
+ .name = "xen",
+ .features = CLOCK_EVT_FEAT_ONESHOT,
+
+ .max_delta_ns = 0xffffffff,
+ .min_delta_ns = TIMER_SLOP,
+
+ .mult = 1,
+ .shift = 0,
+ .rating = 500,
+
+ .set_mode = xen_vcpuop_set_mode,
+ .set_next_event = xen_vcpuop_set_next_event,
+};
+
+static const struct clock_event_device *xen_clockevent =
+ &xen_timerop_clockevent;
+static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
+
+static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
+{
+ struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
+ irqreturn_t ret;
+
+ ret = IRQ_NONE;
+ if (evt->event_handler) {
+ evt->event_handler(evt);
+ ret = IRQ_HANDLED;
+ }
+
+ do_stolen_accounting();
+
+ return ret;
+}
+
+void xen_setup_timer(int cpu)
+{
+ const char *name;
+ struct clock_event_device *evt;
+ int irq;
+
+ printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
+
+ name = kasprintf(GFP_KERNEL, "timer%d", cpu);
+ if (!name)
+ name = "<timer kasprintf failed>";
+
+ irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
+ IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ name, NULL);
+
+ evt = &per_cpu(xen_clock_events, cpu);
+ memcpy(evt, xen_clockevent, sizeof(*evt));
+
+ evt->cpumask = cpumask_of_cpu(cpu);
+ evt->irq = irq;
+
+ setup_runstate_info(cpu);
+}
+
+void xen_setup_cpu_clockevents(void)
+{
+ BUG_ON(preemptible());
+
+ clockevents_register_device(&__get_cpu_var(xen_clock_events));
+}
+
+__init void xen_time_init(void)
+{
+ int cpu = smp_processor_id();
+
+ get_time_values_from_xen();
+
+ clocksource_register(&xen_clocksource);
+
+ if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
+ /* Successfully turned off 100Hz tick, so we have the
+ vcpuop-based timer interface */
+ printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
+ xen_clockevent = &xen_vcpuop_clockevent;
+ }
+
+ /* Set initial system time with full resolution */
+ xen_read_wallclock(&xtime);
+ set_normalized_timespec(&wall_to_monotonic,
+ -xtime.tv_sec, -xtime.tv_nsec);
+
+ tsc_disable = 0;
+
+ xen_setup_timer(cpu);
+ xen_setup_cpu_clockevents();
+}
diff --git a/arch/i386/xen/xen-asm.S b/arch/i386/xen/xen-asm.S
new file mode 100644
index 00000000000..1a43b60c0c6
--- /dev/null
+++ b/arch/i386/xen/xen-asm.S
@@ -0,0 +1,291 @@
+/*
+ Asm versions of Xen pv-ops, suitable for either direct use or inlining.
+ The inline versions are the same as the direct-use versions, with the
+ pre- and post-amble chopped off.
+
+ This code is encoded for size rather than absolute efficiency,
+ with a view to being able to inline as much as possible.
+
+ We only bother with direct forms (ie, vcpu in pda) of the operations
+ here; the indirect forms are better handled in C, since they're
+ generally too large to inline anyway.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+#include <asm/segment.h>
+
+#include <xen/interface/xen.h>
+
+#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
+#define ENDPATCH(x) .globl x##_end; x##_end=.
+
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI 0x80000000
+
+/*
+ Enable events. This clears the event mask and tests the pending
+ event status with a single 'and' operation. If there are pending
+ events, then enter the hypervisor to get them handled.
+ */
+ENTRY(xen_irq_enable_direct)
+ /* Clear mask and test pending */
+ andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
+ /* Preempt here doesn't matter because that will deal with
+ any pending interrupts. The pending check may end up being
+ run on the wrong CPU, but that doesn't hurt. */
+ jz 1f
+2: call check_events
+1:
+ENDPATCH(xen_irq_enable_direct)
+ ret
+ ENDPROC(xen_irq_enable_direct)
+ RELOC(xen_irq_enable_direct, 2b+1)
+
+
+/*
+ Disabling events is simply a matter of making the event mask
+ non-zero.
+ */
+ENTRY(xen_irq_disable_direct)
+ movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+ENDPATCH(xen_irq_disable_direct)
+ ret
+ ENDPROC(xen_irq_disable_direct)
+ RELOC(xen_irq_disable_direct, 0)
+
+/*
+ (xen_)save_fl is used to get the current interrupt enable status.
+ Callers expect the status to be in X86_EFLAGS_IF, and other bits
+ may be set in the return value. We take advantage of this by
+ making sure that X86_EFLAGS_IF has the right value (and other bits
+ in that byte are 0), but other bits in the return value are
+ undefined. We need to toggle the state of the bit, because
+ Xen and x86 use opposite senses (mask vs enable).
+ */
+ENTRY(xen_save_fl_direct)
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+ setz %ah
+ addb %ah,%ah
+ENDPATCH(xen_save_fl_direct)
+ ret
+ ENDPROC(xen_save_fl_direct)
+ RELOC(xen_save_fl_direct, 0)
+
+
+/*
+ In principle the caller should be passing us a value returned
+ from xen_save_fl_direct, but for robustness' sake we test only
+ the X86_EFLAGS_IF flag rather than the whole byte. After
+ setting the interrupt mask state, it checks for unmasked
+ pending events and enters the hypervisor to get them delivered
+ if so.
+ */
+ENTRY(xen_restore_fl_direct)
+ testb $X86_EFLAGS_IF>>8, %ah
+ setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+ /* Preempt here doesn't matter because that will deal with
+ any pending interrupts. The pending check may end up being
+ run on the wrong CPU, but that doesn't hurt. */
+
+ /* check for unmasked and pending */
+ cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
+ jz 1f
+2: call check_events
+1:
+ENDPATCH(xen_restore_fl_direct)
+ ret
+ ENDPROC(xen_restore_fl_direct)
+ RELOC(xen_restore_fl_direct, 2b+1)
+
+/*
+ This is run where a normal iret would be run, with the same stack setup:
+ 8: eflags
+ 4: cs
+ esp-> 0: eip
+
+ This attempts to make sure that any pending events are dealt
+ with on return to usermode, but there is a small window in
+ which an event can happen just before entering usermode. If
+ the nested interrupt ends up setting one of the TIF_WORK_MASK
+ pending work flags, they will not be tested again before
+ returning to usermode. This means that a process can end up
+ with pending work, which will be unprocessed until the process
+ enters and leaves the kernel again, which could be an
+ unbounded amount of time. This means that a pending signal or
+ reschedule event could be indefinitely delayed.
+
+ The fix is to notice a nested interrupt in the critical
+ window, and if one occurs, then fold the nested interrupt into
+ the current interrupt stack frame, and re-process it
+ iteratively rather than recursively. This means that it will
+ exit via the normal path, and all pending work will be dealt
+ with appropriately.
+
+ Because the nested interrupt handler needs to deal with the
+ current stack state in whatever form it's in, we keep things
+ simple by only using a single register which is pushed/popped
+ on the stack.
+
+ Non-direct iret could be done in the same way, but it would
+ require an annoying amount of code duplication. We'll assume
+ that direct mode will be the common case once the hypervisor
+ support becomes commonplace.
+ */
+ENTRY(xen_iret_direct)
+ /* test eflags for special cases */
+ testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
+ jnz hyper_iret
+
+ push %eax
+ ESP_OFFSET=4 # bytes pushed onto stack
+
+ /* Store vcpu_info pointer for easy access. Do it this
+ way to avoid having to reload %fs */
+#ifdef CONFIG_SMP
+ GET_THREAD_INFO(%eax)
+ movl TI_cpu(%eax),%eax
+ movl __per_cpu_offset(,%eax,4),%eax
+ lea per_cpu__xen_vcpu_info(%eax),%eax
+#else
+ movl $per_cpu__xen_vcpu_info, %eax
+#endif
+
+ /* check IF state we're restoring */
+ testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
+
+ /* Maybe enable events. Once this happens we could get a
+ recursive event, so the critical region starts immediately
+ afterwards. However, if that happens we don't end up
+ resuming the code, so we don't have to be worried about
+ being preempted to another CPU. */
+ setz XEN_vcpu_info_mask(%eax)
+xen_iret_start_crit:
+
+ /* check for unmasked and pending */
+ cmpw $0x0001, XEN_vcpu_info_pending(%eax)
+
+ /* If there's something pending, mask events again so we
+ can jump back into xen_hypervisor_callback */
+ sete XEN_vcpu_info_mask(%eax)
+
+ popl %eax
+
+ /* From this point on the registers are restored and the stack
+ updated, so we don't need to worry about it if we're preempted */
+iret_restore_end:
+
+ /* Jump to hypervisor_callback after fixing up the stack.
+ Events are masked, so jumping out of the critical
+ region is OK. */
+ je xen_hypervisor_callback
+
+ iret
+xen_iret_end_crit:
+
+hyper_iret:
+ /* put this out of line since it's very rarely used */
+ jmp hypercall_page + __HYPERVISOR_iret * 32
+
+ .globl xen_iret_start_crit, xen_iret_end_crit
+
+/*
+ This is called by xen_hypervisor_callback in entry.S when it sees
+ that the EIP at the time of interrupt was between xen_iret_start_crit
+ and xen_iret_end_crit. We're passed the EIP in %eax so we can do
+ a more refined determination of what to do.
+
+ The stack format at this point is:
+ ----------------
+ ss : (ss/esp may be present if we came from usermode)
+ esp :
+ eflags } outer exception info
+ cs }
+ eip }
+ ---------------- <- edi (copy dest)
+ eax : outer eax if it hasn't been restored
+ ----------------
+ eflags } nested exception info
+ cs } (no ss/esp because we're nested
+ eip } from the same ring)
+ orig_eax }<- esi (copy src)
+ - - - - - - - -
+ fs }
+ es }
+ ds } SAVE_ALL state
+ eax }
+ : :
+ ebx }
+ ----------------
+ return addr <- esp
+ ----------------
+
+ In order to deliver the nested exception properly, we need to shift
+ everything from the return addr up to the error code so it
+ sits just under the outer exception info. This means that when we
+ handle the exception, we do it in the context of the outer exception
+ rather than starting a new one.
+
+ The only caveat is that if the outer eax hasn't been
+ restored yet (i.e., it's still on the stack), we need to insert
+ its value into the SAVE_ALL state before going on, since
+ it's usermode state which we eventually need to restore.
+ */
+ENTRY(xen_iret_crit_fixup)
+ /* offsets +4 for return address */
+
+ /*
+ Paranoia: Make sure we're really coming from kernel space.
+ One could imagine a case where userspace jumps into the
+ critical range address, but just before the CPU delivers a GP,
+ it decides to deliver an interrupt instead. Unlikely?
+ Definitely. Easy to avoid? Yes. The Intel documents
+ explicitly say that the reported EIP for a bad jump is the
+ jump instruction itself, not the destination, but some virtual
+ environments get this wrong.
+ */
+ movl PT_CS+4(%esp), %ecx
+ andl $SEGMENT_RPL_MASK, %ecx
+ cmpl $USER_RPL, %ecx
+ je 2f
+
+ lea PT_ORIG_EAX+4(%esp), %esi
+ lea PT_EFLAGS+4(%esp), %edi
+
+ /* If eip is before iret_restore_end then the stack
+ hasn't been restored yet. */
+ cmp $iret_restore_end, %eax
+ jae 1f
+
+ movl 0+4(%edi),%eax /* copy EAX */
+ movl %eax, PT_EAX+4(%esp)
+
+ lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
+
+ /* set up the copy */
+1: std
+ mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */
+ rep movsl
+ cld
+
+ lea 4(%edi),%esp /* point esp to new frame */
+2: ret
+
+
+/*
+ Force an event check by making a hypercall,
+ but preserve regs before making the call.
+ */
+check_events:
+ push %eax
+ push %ecx
+ push %edx
+ call force_evtchn_callback
+ pop %edx
+ pop %ecx
+ pop %eax
+ ret
diff --git a/arch/i386/xen/xen-head.S b/arch/i386/xen/xen-head.S
new file mode 100644
index 00000000000..2998d55a001
--- /dev/null
+++ b/arch/i386/xen/xen-head.S
@@ -0,0 +1,36 @@
+/* Xen-specific pieces of head.S, intended to be included in the right
+ place in head.S */
+
+#ifdef CONFIG_XEN
+
+#include <linux/elfnote.h>
+#include <asm/boot.h>
+#include <xen/interface/elfnote.h>
+
+ENTRY(startup_xen)
+ movl %esi,xen_start_info
+ cld
+ movl $(init_thread_union+THREAD_SIZE),%esp
+ jmp xen_start_kernel
+
+.pushsection ".bss.page_aligned"
+ .align PAGE_SIZE_asm
+ENTRY(hypercall_page)
+ .skip 0x1000
+.popsection
+
+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
+ ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
+ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
+ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
+ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
+ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
+#ifdef CONFIG_X86_PAE
+ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
+#else
+ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
+#endif
+ ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
+
+#endif /*CONFIG_XEN */
diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h
new file mode 100644
index 00000000000..b9aaea45f07
--- /dev/null
+++ b/arch/i386/xen/xen-ops.h
@@ -0,0 +1,71 @@
+#ifndef XEN_OPS_H
+#define XEN_OPS_H
+
+#include <linux/init.h>
+
+/* These are code, but not functions. Defined in entry.S */
+extern const char xen_hypervisor_callback[];
+extern const char xen_failsafe_callback[];
+
+void xen_copy_trap_info(struct trap_info *traps);
+
+DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
+DECLARE_PER_CPU(unsigned long, xen_cr3);
+
+extern struct start_info *xen_start_info;
+extern struct shared_info *HYPERVISOR_shared_info;
+
+char * __init xen_memory_setup(void);
+void __init xen_arch_setup(void);
+void __init xen_init_IRQ(void);
+
+void xen_setup_timer(int cpu);
+void xen_setup_cpu_clockevents(void);
+unsigned long xen_cpu_khz(void);
+void __init xen_time_init(void);
+unsigned long xen_get_wallclock(void);
+int xen_set_wallclock(unsigned long time);
+unsigned long long xen_sched_clock(void);
+
+void xen_mark_init_mm_pinned(void);
+
+DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
+
+static inline unsigned xen_get_lazy_mode(void)
+{
+ return x86_read_percpu(xen_lazy_mode);
+}
+
+void __init xen_fill_possible_map(void);
+
+void __init xen_setup_vcpu_info_placement(void);
+void xen_smp_prepare_boot_cpu(void);
+void xen_smp_prepare_cpus(unsigned int max_cpus);
+int xen_cpu_up(unsigned int cpu);
+void xen_smp_cpus_done(unsigned int max_cpus);
+
+void xen_smp_send_stop(void);
+void xen_smp_send_reschedule(int cpu);
+int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
+ int wait);
+int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+ int nonatomic, int wait);
+
+int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
+ void *info, int wait);
+
+
+/* Declare an asm function, along with symbols needed to make it
+ inlineable */
+#define DECL_ASM(ret, name, ...) \
+ ret name(__VA_ARGS__); \
+ extern char name##_end[]; \
+ extern char name##_reloc[] \
+
+DECL_ASM(void, xen_irq_enable_direct, void);
+DECL_ASM(void, xen_irq_disable_direct, void);
+DECL_ASM(unsigned long, xen_save_fl_direct, void);
+DECL_ASM(void, xen_restore_fl_direct, unsigned long);
+
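Editorial aside: the first DECL_ASM use above expands, by plain token pasting, to the three declarations below; presumably the _end and _reloc symbols let the patching machinery measure the assembly body and locate the instruction that needs relocating (cf. ENDPATCH/RELOC in xen-asm.S):

void xen_irq_enable_direct(void);
extern char xen_irq_enable_direct_end[];
extern char xen_irq_enable_direct_reloc[];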
+void xen_iret_direct(void);
+#endif /* XEN_OPS_H */
diff --git a/arch/mips/basler/excite/excite_setup.c b/arch/mips/basler/excite/excite_setup.c
index 2f0e4c08eb0..56003188f17 100644
--- a/arch/mips/basler/excite/excite_setup.c
+++ b/arch/mips/basler/excite/excite_setup.c
@@ -26,6 +26,7 @@
#include <linux/tty.h>
#include <linux/serial_core.h>
#include <linux/serial.h>
+#include <linux/serial_8250.h>
#include <linux/ioport.h>
#include <linux/spinlock.h>
#include <asm/bootinfo.h>
diff --git a/arch/mips/gt64120/wrppmc/setup.c b/arch/mips/gt64120/wrppmc/setup.c
index ea965529e5e..ed58c13b603 100644
--- a/arch/mips/gt64120/wrppmc/setup.c
+++ b/arch/mips/gt64120/wrppmc/setup.c
@@ -14,6 +14,7 @@
#include <linux/tty.h>
#include <linux/serial.h>
#include <linux/serial_core.h>
+#include <linux/serial_8250.h>
#include <linux/pm.h>
#include <asm/io.h>
diff --git a/arch/mips/mips-boards/atlas/atlas_setup.c b/arch/mips/mips-boards/atlas/atlas_setup.c
index 1cc6ebbedfd..c68358a476d 100644
--- a/arch/mips/mips-boards/atlas/atlas_setup.c
+++ b/arch/mips/mips-boards/atlas/atlas_setup.c
@@ -22,6 +22,7 @@
#include <linux/tty.h>
#include <linux/serial.h>
#include <linux/serial_core.h>
+#include <linux/serial_8250.h>
#include <asm/cpu.h>
#include <asm/bootinfo.h>
diff --git a/arch/mips/mips-boards/sead/sead_setup.c b/arch/mips/mips-boards/sead/sead_setup.c
index bb801409d39..5f70eaf01fa 100644
--- a/arch/mips/mips-boards/sead/sead_setup.c
+++ b/arch/mips/mips-boards/sead/sead_setup.c
@@ -23,6 +23,7 @@
#include <linux/tty.h>
#include <linux/serial.h>
#include <linux/serial_core.h>
+#include <linux/serial_8250.h>
#include <asm/cpu.h>
#include <asm/bootinfo.h>
diff --git a/arch/mips/mipssim/sim_setup.c b/arch/mips/mipssim/sim_setup.c
index 60e66906be6..17819b59410 100644
--- a/arch/mips/mipssim/sim_setup.c
+++ b/arch/mips/mipssim/sim_setup.c
@@ -26,6 +26,7 @@
#include <linux/tty.h>
#include <linux/serial.h>
#include <linux/serial_core.h>
+#include <linux/serial_8250.h>
#include <asm/cpu.h>
#include <asm/bootinfo.h>
diff --git a/arch/mips/pmc-sierra/msp71xx/msp_serial.c b/arch/mips/pmc-sierra/msp71xx/msp_serial.c
index c41b53faa8f..e25bac537d7 100644
--- a/arch/mips/pmc-sierra/msp71xx/msp_serial.c
+++ b/arch/mips/pmc-sierra/msp71xx/msp_serial.c
@@ -32,6 +32,7 @@
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/serial.h>
+#include <linux/serial_8250.h>
#include <msp_prom.h>
#include <msp_int.h>
diff --git a/arch/mips/pmc-sierra/yosemite/setup.c b/arch/mips/pmc-sierra/yosemite/setup.c
index 6a6e15e4000..f7f93ae24c3 100644
--- a/arch/mips/pmc-sierra/yosemite/setup.c
+++ b/arch/mips/pmc-sierra/yosemite/setup.c
@@ -39,6 +39,7 @@
#include <linux/tty.h>
#include <linux/serial.h>
#include <linux/serial_core.h>
+#include <linux/serial_8250.h>
#include <asm/time.h>
#include <asm/bootinfo.h>
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index faf5ef3e90d..94b4a028232 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -156,11 +156,14 @@ static DEVICE_ATTR(devspec, S_IRUGO, pci_show_devspec, NULL);
#endif /* CONFIG_PPC_OF */
/* Add sysfs properties */
-void pcibios_add_platform_entries(struct pci_dev *pdev)
+int pcibios_add_platform_entries(struct pci_dev *pdev)
{
#ifdef CONFIG_PPC_OF
- device_create_file(&pdev->dev, &dev_attr_devspec);
+ return device_create_file(&pdev->dev, &dev_attr_devspec);
+#else
+ return 0;
#endif /* CONFIG_PPC_OF */
}
char __init *pcibios_setup(char *str)
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index bc43bba05cf..6018178708a 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -350,11 +350,13 @@ void __init setup_system(void)
{
DBG(" -> setup_system()\n");
- /* Apply CPUs-specific fixups to kernel text (nop out sections
- * not relevant to this CPU)
+ /* Apply the CPU-specific and firmware-specific fixups to kernel
+ * text (nop out sections not relevant to this CPU or this firmware)
*/
do_feature_fixups(cur_cpu_spec->cpu_features,
&__start___ftr_fixup, &__stop___ftr_fixup);
+ do_feature_fixups(powerpc_firmware_features,
+ &__start___fw_ftr_fixup, &__stop___fw_ftr_fixup);
/*
* Unflatten the device-tree passed by prom_init or kexec
@@ -392,12 +394,6 @@ void __init setup_system(void)
if (ppc_md.init_early)
ppc_md.init_early();
- /* Apply firmware specific fixups to kernel text (nop out
- * sections not relevant to this firmware)
- */
- do_feature_fixups(powerpc_firmware_features,
- &__start___fw_ftr_fixup, &__stop___fw_ftr_fixup);
-
/*
* We can discover serial ports now since the above did setup the
* hash table management for us, thus ioremap works. We do that early
diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c
index b42cbf1e2d7..bd85b5fd08c 100644
--- a/arch/powerpc/kernel/sys_ppc32.c
+++ b/arch/powerpc/kernel/sys_ppc32.c
@@ -773,6 +773,13 @@ asmlinkage int compat_sys_truncate64(const char __user * path, u32 reg4,
return sys_truncate(path, (high << 32) | low);
}
+asmlinkage long compat_sys_fallocate(int fd, int mode, u32 offhi, u32 offlo,
+ u32 lenhi, u32 lenlo)
+{
+ return sys_fallocate(fd, mode, ((loff_t)offhi << 32) | offlo,
+ ((loff_t)lenhi << 32) | lenlo);
+}
+
asmlinkage int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long high,
unsigned long low)
{
diff --git a/arch/ppc/platforms/4xx/bamboo.c b/arch/ppc/platforms/4xx/bamboo.c
index 349660b84a0..017623c9bc4 100644
--- a/arch/ppc/platforms/4xx/bamboo.c
+++ b/arch/ppc/platforms/4xx/bamboo.c
@@ -29,6 +29,7 @@
#include <linux/tty.h>
#include <linux/serial.h>
#include <linux/serial_core.h>
+#include <linux/serial_8250.h>
#include <linux/ethtool.h>
#include <asm/system.h>
diff --git a/arch/ppc/platforms/4xx/bubinga.c b/arch/ppc/platforms/4xx/bubinga.c
index 1a7f075b754..cd696be55ac 100644
--- a/arch/ppc/platforms/4xx/bubinga.c
+++ b/arch/ppc/platforms/4xx/bubinga.c
@@ -21,6 +21,7 @@
#include <linux/tty.h>
#include <linux/serial.h>
#include <linux/serial_core.h>
+#include <linux/serial_8250.h>
#include <asm/system.h>
#include <asm/pci-bridge.h>
diff --git a/arch/ppc/platforms/4xx/cpci405.c b/arch/ppc/platforms/4xx/cpci405.c
index 8474b05b795..2e7e25dd84c 100644
--- a/arch/ppc/platforms/4xx/cpci405.c
+++ b/arch/ppc/platforms/4xx/cpci405.c
@@ -23,6 +23,7 @@
#include <asm/todc.h>
#include <linux/serial.h>
#include <linux/serial_core.h>
+#include <linux/serial_8250.h>
#include <asm/ocp.h>
#include <asm/ibm_ocp_pci.h>
#include <platforms/4xx/ibm405gp.h>
diff --git a/arch/ppc/platforms/4xx/ebony.c b/arch/ppc/platforms/4xx/ebony.c
index f0f9cc8480c..05d7184d7e1 100644
--- a/arch/ppc/platforms/4xx/ebony.c
+++ b/arch/ppc/platforms/4xx/ebony.c
@@ -32,6 +32,7 @@
#include <linux/tty.h>
#include <linux/serial.h>
#include <linux/serial_core.h>
+#include <linux/serial_8250.h>
#include <asm/system.h>
#include <asm/pgtable.h>
diff --git a/arch/ppc/platforms/4xx/luan.c b/arch/ppc/platforms/4xx/luan.c
index 61706ef3711..4b169610f15 100644
--- a/arch/ppc/platforms/4xx/luan.c
+++ b/arch/ppc/platforms/4xx/luan.c
@@ -30,6 +30,7 @@
#include <linux/tty.h>
#include <linux/serial.h>
#include <linux/serial_core.h>
+#include <linux/serial_8250.h>
#include <asm/system.h>
#include <asm/pgtable.h>
diff --git a/arch/ppc/platforms/4xx/ocotea.c b/arch/ppc/platforms/4xx/ocotea.c
index 5e994e146ba..fd0f971881d 100644
--- a/arch/ppc/platforms/4xx/ocotea.c
+++ b/arch/ppc/platforms/4xx/ocotea.c
@@ -30,6 +30,7 @@
#include <linux/tty.h>
#include <linux/serial.h>
#include <linux/serial_core.h>
+#include <linux/serial_8250.h>
#include <asm/system.h>
#include <asm/pgtable.h>
diff --git a/arch/ppc/platforms/4xx/taishan.c b/arch/ppc/platforms/4xx/taishan.c
index 5d9af8ddb15..888c492b4a4 100644
--- a/arch/ppc/platforms/4xx/taishan.c
+++ b/arch/ppc/platforms/4xx/taishan.c
@@ -30,6 +30,7 @@
#include <linux/tty.h>
#include <linux/serial.h>
#include <linux/serial_core.h>
+#include <linux/serial_8250.h>
#include <linux/platform_device.h>
#include <linux/mtd/partitions.h>
#include <linux/mtd/nand.h>
diff --git a/arch/ppc/platforms/4xx/yucca.c b/arch/ppc/platforms/4xx/yucca.c
index 346787df0dd..a83b0baea01 100644
--- a/arch/ppc/platforms/4xx/yucca.c
+++ b/arch/ppc/platforms/4xx/yucca.c
@@ -31,6 +31,7 @@
#include <linux/tty.h>
#include <linux/serial.h>
#include <linux/serial_core.h>
+#include <linux/serial_8250.h>
#include <asm/system.h>
#include <asm/pgtable.h>
diff --git a/arch/ppc/platforms/85xx/sbc8560.c b/arch/ppc/platforms/85xx/sbc8560.c
index 1d10ab98f66..3d7addbdecf 100644
--- a/arch/ppc/platforms/85xx/sbc8560.c
+++ b/arch/ppc/platforms/85xx/sbc8560.c
@@ -26,6 +26,7 @@
#include <linux/serial.h>
#include <linux/tty.h> /* for linux/serial_core.h */
#include <linux/serial_core.h>
+#include <linux/serial_8250.h>
#include <linux/initrd.h>
#include <linux/module.h>
#include <linux/fsl_devices.h>
diff --git a/arch/ppc/platforms/chestnut.c b/arch/ppc/platforms/chestnut.c
index a764ae71cbc..248684f50dd 100644
--- a/arch/ppc/platforms/chestnut.c
+++ b/arch/ppc/platforms/chestnut.c
@@ -25,6 +25,7 @@
#include <linux/ide.h>
#include <linux/serial.h>
#include <linux/serial_core.h>
+#include <linux/serial_8250.h>
#include <linux/mtd/physmap.h>
#include <asm/system.h>
#include <asm/pgtable.h>
diff --git a/arch/ppc/platforms/ev64260.c b/arch/ppc/platforms/ev64260.c
index 4957a7bcde2..976270d537c 100644
--- a/arch/ppc/platforms/ev64260.c
+++ b/arch/ppc/platforms/ev64260.c
@@ -35,6 +35,7 @@
#include <linux/serial.h>
#include <linux/tty.h>
#include <linux/serial_core.h>
+#include <linux/serial_8250.h>
#else
#include <linux/mv643xx.h>
#endif
diff --git a/arch/ppc/platforms/radstone_ppc7d.c b/arch/ppc/platforms/radstone_ppc7d.c
index b55860734a7..44d4398a36f 100644
--- a/arch/ppc/platforms/radstone_ppc7d.c
+++ b/arch/ppc/platforms/radstone_ppc7d.c
@@ -35,6 +35,7 @@
#include <linux/serial.h>
#include <linux/tty.h> /* for linux/serial_core.h */
#include <linux/serial_core.h>
+#include <linux/serial_8250.h>
#include <linux/mv643xx.h>
#include <linux/netdevice.h>
#include <linux/platform_device.h>
diff --git a/arch/ppc/platforms/spruce.c b/arch/ppc/platforms/spruce.c
index 3c784278487..f4de50ba292 100644
--- a/arch/ppc/platforms/spruce.c
+++ b/arch/ppc/platforms/spruce.c
@@ -27,6 +27,7 @@
#include <linux/serial.h>
#include <linux/tty.h>
#include <linux/serial_core.h>
+#include <linux/serial_8250.h>
#include <asm/system.h>
#include <asm/pgtable.h>
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 73df7115325..603d83ad65c 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -21,6 +21,9 @@ config GENERIC_ISA_DMA
bool
default y
+config ARCH_NO_VIRT_TO_BUS
+ def_bool y
+
source "init/Kconfig"
menu "General machine setup"
diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig
index b84b6af1241..df6ee71894d 100644
--- a/arch/sparc64/Kconfig
+++ b/arch/sparc64/Kconfig
@@ -62,6 +62,9 @@ config AUDIT_ARCH
bool
default y
+config ARCH_NO_VIRT_TO_BUS
+ def_bool y
+
choice
prompt "Kernel page size"
default SPARC64_PAGE_SIZE_8KB
diff --git a/arch/sparc64/kernel/ds.c b/arch/sparc64/kernel/ds.c
index ba01533f4e0..fa1f04d756a 100644
--- a/arch/sparc64/kernel/ds.c
+++ b/arch/sparc64/kernel/ds.c
@@ -1013,6 +1013,19 @@ static void ds_up(struct ds_info *dp)
dp->hs_state = DS_HS_START;
}
+static void ds_reset(struct ds_info *dp)
+{
+ int i;
+
+ dp->hs_state = 0;
+
+ for (i = 0; i < ARRAY_SIZE(ds_states); i++) {
+ struct ds_cap_state *cp = &ds_states[i];
+
+ cp->state = CAP_STATE_UNKNOWN;
+ }
+}
+
static void ds_event(void *arg, int event)
{
struct ds_info *dp = arg;
@@ -1028,6 +1041,12 @@ static void ds_event(void *arg, int event)
return;
}
+ if (event == LDC_EVENT_RESET) {
+ ds_reset(dp);
+ spin_unlock_irqrestore(&ds_lock, flags);
+ return;
+ }
+
if (event != LDC_EVENT_DATA_READY) {
printk(KERN_WARNING PFX "Unexpected LDC event %d\n", event);
spin_unlock_irqrestore(&ds_lock, flags);
diff --git a/arch/sparc64/kernel/mdesc.c b/arch/sparc64/kernel/mdesc.c
index de5310ffdb4..302ba5e5a0b 100644
--- a/arch/sparc64/kernel/mdesc.c
+++ b/arch/sparc64/kernel/mdesc.c
@@ -137,7 +137,7 @@ static struct mdesc_handle *mdesc_kmalloc(unsigned int mdesc_size)
sizeof(struct mdesc_hdr) +
mdesc_size);
- base = kmalloc(handle_size + 15, GFP_KERNEL);
+ base = kmalloc(handle_size + 15, GFP_KERNEL | __GFP_NOFAIL);
if (base) {
struct mdesc_handle *hp;
unsigned long addr;
@@ -214,18 +214,83 @@ void mdesc_release(struct mdesc_handle *hp)
}
EXPORT_SYMBOL(mdesc_release);
+static DEFINE_MUTEX(mdesc_mutex);
+static struct mdesc_notifier_client *client_list;
+
+void mdesc_register_notifier(struct mdesc_notifier_client *client)
+{
+ u64 node;
+
+ mutex_lock(&mdesc_mutex);
+ client->next = client_list;
+ client_list = client;
+
+ mdesc_for_each_node_by_name(cur_mdesc, node, client->node_name)
+ client->add(cur_mdesc, node);
+
+ mutex_unlock(&mdesc_mutex);
+}
+
+/* Run 'func' on nodes which are in A but not in B. */
+static void invoke_on_missing(const char *name,
+ struct mdesc_handle *a,
+ struct mdesc_handle *b,
+ void (*func)(struct mdesc_handle *, u64))
+{
+ u64 node;
+
+ mdesc_for_each_node_by_name(a, node, name) {
+ const u64 *id = mdesc_get_property(a, node, "id", NULL);
+ int found = 0;
+ u64 fnode;
+
+ mdesc_for_each_node_by_name(b, fnode, name) {
+ const u64 *fid = mdesc_get_property(b, fnode,
+ "id", NULL);
+
+ if (*id == *fid) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found)
+ func(a, node);
+ }
+}
+
+static void notify_one(struct mdesc_notifier_client *p,
+ struct mdesc_handle *old_hp,
+ struct mdesc_handle *new_hp)
+{
+ invoke_on_missing(p->node_name, old_hp, new_hp, p->remove);
+ invoke_on_missing(p->node_name, new_hp, old_hp, p->add);
+}
+
+static void mdesc_notify_clients(struct mdesc_handle *old_hp,
+ struct mdesc_handle *new_hp)
+{
+ struct mdesc_notifier_client *p = client_list;
+
+ while (p) {
+ notify_one(p, old_hp, new_hp);
+ p = p->next;
+ }
+}
+
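Editorial aside: the notifier logic above is a set difference applied twice; nodes present in the old MD but not in the new one get remove(), and nodes present only in the new one get add(). The same "call f on elements of a missing from b" pattern, reduced to plain integer IDs (purely illustrative, not kernel code):

#include <stdio.h>

static void on_missing(const int *a, int na, const int *b, int nb,
                       void (*f)(int))
{
        int i, j, found;

        for (i = 0; i < na; i++) {
                found = 0;
                for (j = 0; j < nb; j++) {
                        if (a[i] == b[j]) {
                                found = 1;
                                break;
                        }
                }
                if (!found)
                        f(a[i]);
        }
}

static void removed(int id) { printf("remove %d\n", id); }
static void added(int id)   { printf("add %d\n", id); }

int main(void)
{
        int old_ids[] = { 1, 2, 3 };    /* hypothetical node ids */
        int new_ids[] = { 2, 3, 4 };

        on_missing(old_ids, 3, new_ids, 3, removed);    /* prints: remove 1 */
        on_missing(new_ids, 3, old_ids, 3, added);      /* prints: add 4 */
        return 0;
}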
void mdesc_update(void)
{
unsigned long len, real_len, status;
struct mdesc_handle *hp, *orig_hp;
unsigned long flags;
+ mutex_lock(&mdesc_mutex);
+
(void) sun4v_mach_desc(0UL, 0UL, &len);
hp = mdesc_alloc(len, &kmalloc_mdesc_memops);
if (!hp) {
printk(KERN_ERR "MD: mdesc alloc fails\n");
- return;
+ goto out;
}
status = sun4v_mach_desc(__pa(&hp->mdesc), len, &real_len);
@@ -234,18 +299,25 @@ void mdesc_update(void)
status);
atomic_dec(&hp->refcnt);
mdesc_free(hp);
- return;
+ goto out;
}
spin_lock_irqsave(&mdesc_lock, flags);
orig_hp = cur_mdesc;
cur_mdesc = hp;
+ spin_unlock_irqrestore(&mdesc_lock, flags);
+ mdesc_notify_clients(orig_hp, hp);
+
+ spin_lock_irqsave(&mdesc_lock, flags);
if (atomic_dec_and_test(&orig_hp->refcnt))
mdesc_free(orig_hp);
else
list_add(&orig_hp->list, &mdesc_zombie_list);
spin_unlock_irqrestore(&mdesc_lock, flags);
+
+out:
+ mutex_unlock(&mdesc_mutex);
}
static struct mdesc_elem *node_block(struct mdesc_hdr *mdesc)
diff --git a/arch/sparc64/kernel/vio.c b/arch/sparc64/kernel/vio.c
index 49569b44ea1..8d3cc4fdb55 100644
--- a/arch/sparc64/kernel/vio.c
+++ b/arch/sparc64/kernel/vio.c
@@ -201,10 +201,11 @@ static void vio_fill_channel_info(struct mdesc_handle *hp, u64 mp,
static struct vio_dev *vio_create_one(struct mdesc_handle *hp, u64 mp,
struct device *parent)
{
- const char *type, *compat;
+ const char *type, *compat, *bus_id_name;
struct device_node *dp;
struct vio_dev *vdev;
int err, tlen, clen;
+ const u64 *id;
type = mdesc_get_property(hp, mp, "device-type", &tlen);
if (!type) {
@@ -220,6 +221,16 @@ static struct vio_dev *vio_create_one(struct mdesc_handle *hp, u64 mp,
return NULL;
}
+ bus_id_name = type;
+ if (!strcmp(type, "domain-services-port"))
+ bus_id_name = "ds";
+
+ if (strlen(bus_id_name) >= KOBJ_NAME_LEN - 4) {
+ printk(KERN_ERR "VIO: bus_id_name [%s] is too long.\n",
+ bus_id_name);
+ return NULL;
+ }
+
compat = mdesc_get_property(hp, mp, "device-type", &clen);
if (!compat) {
clen = 0;
@@ -249,7 +260,14 @@ static struct vio_dev *vio_create_one(struct mdesc_handle *hp, u64 mp,
vio_fill_channel_info(hp, mp, vdev);
- snprintf(vdev->dev.bus_id, BUS_ID_SIZE, "%lx", mp);
+ id = mdesc_get_property(hp, mp, "id", NULL);
+ if (!id)
+ snprintf(vdev->dev.bus_id, BUS_ID_SIZE, "%s",
+ bus_id_name);
+ else
+ snprintf(vdev->dev.bus_id, BUS_ID_SIZE, "%s-%lu",
+ bus_id_name, *id);
+
vdev->dev.parent = parent;
vdev->dev.bus = &vio_bus_type;
vdev->dev.release = vio_dev_release;
@@ -269,6 +287,8 @@ static struct vio_dev *vio_create_one(struct mdesc_handle *hp, u64 mp,
}
vdev->dp = dp;
+ printk(KERN_ERR "VIO: Adding device %s\n", vdev->dev.bus_id);
+
err = device_register(&vdev->dev);
if (err) {
printk(KERN_ERR "VIO: Could not register device %s, err=%d\n",
@@ -283,46 +303,46 @@ static struct vio_dev *vio_create_one(struct mdesc_handle *hp, u64 mp,
return vdev;
}
-static void walk_tree(struct mdesc_handle *hp, u64 n, struct vio_dev *parent)
+static void vio_add(struct mdesc_handle *hp, u64 node)
{
- u64 a;
-
- mdesc_for_each_arc(a, hp, n, MDESC_ARC_TYPE_FWD) {
- struct vio_dev *vdev;
- u64 target;
-
- target = mdesc_arc_target(hp, a);
- vdev = vio_create_one(hp, target, &parent->dev);
- if (vdev)
- walk_tree(hp, target, vdev);
- }
+ (void) vio_create_one(hp, node, &root_vdev->dev);
}
-static void create_devices(struct mdesc_handle *hp, u64 root)
+static int vio_md_node_match(struct device *dev, void *arg)
{
- u64 mp;
+ struct vio_dev *vdev = to_vio_dev(dev);
- root_vdev = vio_create_one(hp, root, NULL);
- if (!root_vdev) {
- printk(KERN_ERR "VIO: Coult not create root device.\n");
- return;
- }
+ if (vdev->mp == (u64) arg)
+ return 1;
- walk_tree(hp, root, root_vdev);
+ return 0;
+}
+
+static void vio_remove(struct mdesc_handle *hp, u64 node)
+{
+ struct device *dev;
- /* Domain services is odd as it doesn't sit underneath the
- * channel-devices node, so we plug it in manually.
- */
- mp = mdesc_node_by_name(hp, MDESC_NODE_NULL, "domain-services");
- if (mp != MDESC_NODE_NULL) {
- struct vio_dev *parent = vio_create_one(hp, mp,
- &root_vdev->dev);
+ dev = device_find_child(&root_vdev->dev, (void *) node,
+ vio_md_node_match);
+ if (dev) {
+ printk(KERN_INFO "VIO: Removing device %s\n", dev->bus_id);
- if (parent)
- walk_tree(hp, mp, parent);
+ device_unregister(dev);
}
}
+static struct mdesc_notifier_client vio_device_notifier = {
+ .add = vio_add,
+ .remove = vio_remove,
+ .node_name = "virtual-device-port",
+};
+
+static struct mdesc_notifier_client vio_ds_notifier = {
+ .add = vio_add,
+ .remove = vio_remove,
+ .node_name = "domain-services-port",
+};
+
const char *channel_devices_node = "channel-devices";
const char *channel_devices_compat = "SUNW,sun4v-channel-devices";
const char *cfg_handle_prop = "cfg-handle";
@@ -381,11 +401,19 @@ static int __init vio_init(void)
cdev_cfg_handle = *cfg_handle;
- create_devices(hp, root);
+ root_vdev = vio_create_one(hp, root, NULL);
+ err = -ENODEV;
+ if (!root_vdev) {
+ printk(KERN_ERR "VIO: Coult not create root device.\n");
+ goto out_release;
+ }
+
+ mdesc_register_notifier(&vio_device_notifier);
+ mdesc_register_notifier(&vio_ds_notifier);
mdesc_release(hp);
return 0;
out_release:
mdesc_release(hp);
diff --git a/arch/sparc64/kernel/viohs.c b/arch/sparc64/kernel/viohs.c
index 15613add45d..09126fc338b 100644
--- a/arch/sparc64/kernel/viohs.c
+++ b/arch/sparc64/kernel/viohs.c
@@ -78,6 +78,24 @@ static int start_handshake(struct vio_driver_state *vio)
return 0;
}
+static void flush_rx_dring(struct vio_driver_state *vio)
+{
+ struct vio_dring_state *dr;
+ u64 ident;
+
+ BUG_ON(!(vio->dr_state & VIO_DR_STATE_RXREG));
+
+ dr = &vio->drings[VIO_DRIVER_RX_RING];
+ ident = dr->ident;
+
+ BUG_ON(!vio->desc_buf);
+ kfree(vio->desc_buf);
+ vio->desc_buf = NULL;
+
+ memset(dr, 0, sizeof(*dr));
+ dr->ident = ident;
+}
+
void vio_link_state_change(struct vio_driver_state *vio, int event)
{
if (event == LDC_EVENT_UP) {
@@ -98,6 +116,16 @@ void vio_link_state_change(struct vio_driver_state *vio, int event)
break;
}
start_handshake(vio);
+ } else if (event == LDC_EVENT_RESET) {
+ vio->hs_state = VIO_HS_INVALID;
+
+ if (vio->dr_state & VIO_DR_STATE_RXREG)
+ flush_rx_dring(vio);
+
+ vio->dr_state = 0x00;
+ memset(&vio->ver, 0, sizeof(vio->ver));
+
+ ldc_disconnect(vio->lp);
}
}
EXPORT_SYMBOL(vio_link_state_change);
@@ -396,6 +424,8 @@ static int process_dreg_info(struct vio_driver_state *vio,
if (vio->dr_state & VIO_DR_STATE_RXREG)
goto send_nack;
+ BUG_ON(vio->desc_buf);
+
vio->desc_buf = kzalloc(pkt->descr_size, GFP_ATOMIC);
if (!vio->desc_buf)
goto send_nack;
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 782dea81943..3f66e970d86 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -719,4 +719,5 @@ ia32_sys_call_table:
.quad compat_sys_signalfd
.quad compat_sys_timerfd
.quad sys_eventfd
+ .quad sys32_fallocate
ia32_syscall_end:
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
index 99a78a3cce7..bee96d61443 100644
--- a/arch/x86_64/ia32/sys_ia32.c
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -879,3 +879,11 @@ asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi,
return sys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo,
len, advice);
}
+
+asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo,
+ unsigned offset_hi, unsigned len_lo,
+ unsigned len_hi)
+{
+ return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo,
+ ((u64)len_hi << 32) | len_lo);
+}
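Editorial aside: both fallocate compat wrappers in this patch (the ppc32 one earlier and this ia32 one) reassemble a 64-bit offset or length that 32-bit userspace passes as two 32-bit halves. A quick standalone check of the arithmetic, with made-up values:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint32_t offset_hi = 0x1, offset_lo = 0x23456789;       /* hypothetical halves */
        uint64_t offset = ((uint64_t)offset_hi << 32) | offset_lo;

        assert(offset == 0x123456789ULL);
        return 0;
}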
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
index 296d2b0c5d8..fd9aff3f389 100644
--- a/arch/x86_64/kernel/early_printk.c
+++ b/arch/x86_64/kernel/early_printk.c
@@ -6,6 +6,7 @@
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/fcntl.h>
+#include <xen/hvc-console.h>
/* Simple VGA output */
@@ -242,6 +243,10 @@ static int __init setup_early_printk(char *buf)
simnow_init(buf + 6);
early_console = &simnow_console;
keep_early = 1;
+#ifdef CONFIG_HVC_XEN
+ } else if (!strncmp(buf, "xen", 3)) {
+ early_console = &xenboot_console;
+#endif
}
if (keep_early)
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index aa1d1599179..f3fb8174559 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -174,7 +174,7 @@ static void do_mce_trigger(void)
if (events != atomic_read(&mce_logged) && trigger[0]) {
/* Small race window, but should be harmless. */
atomic_set(&mce_logged, events);
- call_usermodehelper(trigger, trigger_argv, NULL, -1);
+ call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}
}
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index fa6775ef729..e83cc67155a 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -102,16 +102,25 @@ unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *r
u32 *desc;
unsigned long base;
- down(&child->mm->context.sem);
- desc = child->mm->context.ldt + (seg & ~7);
- base = (desc[0] >> 16) | ((desc[1] & 0xff) << 16) | (desc[1] & 0xff000000);
+ seg &= ~7UL;
- /* 16-bit code segment? */
- if (!((desc[1] >> 22) & 1))
- addr &= 0xffff;
- addr += base;
+ down(&child->mm->context.sem);
+ if (unlikely((seg >> 3) >= child->mm->context.size))
+ addr = -1L; /* bogus selector, access would fault */
+ else {
+ desc = child->mm->context.ldt + seg;
+ base = ((desc[0] >> 16) |
+ ((desc[1] & 0xff) << 16) |
+ (desc[1] & 0xff000000));
+
+ /* 16-bit code segment? */
+ if (!((desc[1] >> 22) & 1))
+ addr &= 0xffff;
+ addr += base;
+ }
up(&child->mm->context.sem);
}
+
return addr;
}
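Editorial aside: the new bounds check operates on the descriptor index encoded in the selector. The low three bits of a selector are the RPL and table-indicator bit, so after seg &= ~7 the index compared against context.size is seg >> 3. A small standalone illustration with a made-up selector value:

#include <assert.h>

int main(void)
{
        unsigned long seg = 0x3f;       /* hypothetical selector: index 7, TI=1, RPL=3 */
        unsigned long index;

        seg &= ~7UL;                    /* strip RPL and table-indicator bits */
        index = seg >> 3;               /* descriptor index to bounds-check */

        assert(index == 7);
        return 0;
}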