17 files changed, 10088 insertions, 290 deletions
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 962817e49fb..636af286230 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -22,6 +22,39 @@ config ATMEL_PWM
 	  purposes including software controlled power-efficent backlights
 	  on LCD displays, motor control, and waveform generation.
 
+config ATMEL_TCLIB
+	bool "Atmel AT32/AT91 Timer/Counter Library"
+	depends on (AVR32 || ARCH_AT91)
+	help
+	  Select this if you want a library to allocate the Timer/Counter
+	  blocks found on many Atmel processors.  This facilitates using
+	  these blocks by different drivers despite processor differences.
+
+config ATMEL_TCB_CLKSRC
+	bool "TC Block Clocksource"
+	depends on ATMEL_TCLIB && GENERIC_TIME
+	default y
+	help
+	  Select this to get a high precision clocksource based on a
+	  TC block with a 5+ MHz base clock rate.  Two timer channels
+	  are combined to make a single 32-bit timer.
+
+	  When GENERIC_CLOCKEVENTS is defined, the third timer channel
+	  may be used as a clock event device supporting oneshot mode
+	  (delays of up to two seconds) based on the 32 KiHz clock.
+
+config ATMEL_TCB_CLKSRC_BLOCK
+	int
+	depends on ATMEL_TCB_CLKSRC
+	prompt "TC Block" if ARCH_AT91RM9200 || ARCH_AT91SAM9260 || CPU_AT32AP700X
+	default 0
+	range 0 1
+	help
+	  Some chips provide more than one TC block, so you have the
+	  choice of which one to use for the clock framework.  The other
+	  TC can be used for other purposes, such as PWM generation and
+	  interval timing.
+
 config IBM_ASM
 	tristate "Device driver for IBM RSA service processor"
 	depends on X86 && PCI && INPUT && EXPERIMENTAL
@@ -107,6 +140,7 @@ config ACER_WMI
 	depends on EXPERIMENTAL
 	depends on ACPI
 	depends on LEDS_CLASS
+	depends on NEW_LEDS
 	depends on BACKLIGHT_CLASS_DEVICE
 	depends on SERIO_I8042
 	select ACPI_WMI
@@ -127,6 +161,7 @@ config ASUS_LAPTOP
         depends on ACPI
 	depends on EXPERIMENTAL && !ACPI_ASUS
 	depends on LEDS_CLASS
+	depends on NEW_LEDS
 	depends on BACKLIGHT_CLASS_DEVICE
         ---help---
 	  This is the new Linux driver for Asus laptops. It may also support some
@@ -208,10 +243,13 @@ config SONYPI_COMPAT
 config THINKPAD_ACPI
 	tristate "ThinkPad ACPI Laptop Extras"
 	depends on X86 && ACPI
+	select BACKLIGHT_LCD_SUPPORT
 	select BACKLIGHT_CLASS_DEVICE
 	select HWMON
 	select NVRAM
-	depends on INPUT
+	select INPUT
+	select NEW_LEDS
+	select LEDS_CLASS
 	---help---
 	  This is a driver for the IBM and Lenovo ThinkPad laptops. It adds
 	  support for Fn-Fx key combinations, Bluetooth control, video
@@ -311,6 +349,7 @@ config ATMEL_SSC
 config INTEL_MENLOW
 	tristate "Thermal Management driver for Intel menlow platform"
 	depends on ACPI_THERMAL
+	select THERMAL
 	depends on X86
 	---help---
 	  ACPI thermal management enhancement driver on
@@ -318,6 +357,19 @@ config INTEL_MENLOW
 
 	  If unsure, say N.
 
+config EEEPC_LAPTOP
+	tristate "Eee PC Hotkey Driver (EXPERIMENTAL)"
+	depends on X86
+	depends on ACPI
+	depends on BACKLIGHT_CLASS_DEVICE
+	depends on HWMON
+	depends on EXPERIMENTAL
+	---help---
+	  This driver supports the Fn-Fx keys on Eee PC laptops.
+	  It also adds the ability to switch camera/wlan on/off.
+
+	  If you have an Eee PC laptop, say Y or M here.
+
 config ENCLOSURE_SERVICES
 	tristate "Enclosure Services"
 	default n
@@ -327,4 +379,16 @@ config ENCLOSURE_SERVICES
 	  driver (SCSI/ATA) which supports enclosures
 	  or a SCSI enclosure device (SES) to use these services.
 
+config SGI_XP
+	tristate "Support communication between SGI SSIs"
+	depends on IA64_GENERIC || IA64_SGI_SN2
+	select IA64_UNCACHED_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
+	select GENERIC_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
+	---help---
+	  An SGI machine can be divided into multiple Single System
+	  Images which act independently of each other and have
+	  hardware based memory protection from the others.  Enabling
+	  this feature will allow for direct communication between SSIs
+	  based on a network adapter and DMA messaging.
+
 endif # MISC_DEVICES
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index 3b12f5da856..1952875a272 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -7,9 +7,11 @@ obj-$(CONFIG_IBM_ASM)		+= ibmasm/
 obj-$(CONFIG_HDPU_FEATURES)	+= hdpuftrs/
 obj-$(CONFIG_MSI_LAPTOP)     += msi-laptop.o
 obj-$(CONFIG_ACER_WMI)     += acer-wmi.o
-obj-$(CONFIG_ASUS_LAPTOP)     += asus-laptop.o
+obj-$(CONFIG_ASUS_LAPTOP)	+= asus-laptop.o
+obj-$(CONFIG_EEEPC_LAPTOP)	+= eeepc-laptop.o
 obj-$(CONFIG_ATMEL_PWM)		+= atmel_pwm.o
 obj-$(CONFIG_ATMEL_SSC)		+= atmel-ssc.o
+obj-$(CONFIG_ATMEL_TCLIB)	+= atmel_tclib.o
 obj-$(CONFIG_TC1100_WMI)	+= tc1100-wmi.o
 obj-$(CONFIG_LKDTM)		+= lkdtm.o
 obj-$(CONFIG_TIFM_CORE)       	+= tifm_core.o
@@ -22,3 +24,5 @@ obj-$(CONFIG_FUJITSU_LAPTOP)	+= fujitsu-laptop.o
 obj-$(CONFIG_EEPROM_93CX6)	+= eeprom_93cx6.o
 obj-$(CONFIG_INTEL_MENLOW)	+= intel_menlow.o
 obj-$(CONFIG_ENCLOSURE_SERVICES) += enclosure.o
+obj-$(CONFIG_KGDB_TESTS)	+= kgdbts.o
+obj-$(CONFIG_SGI_XP)		+= sgi-xp/
diff --git a/drivers/misc/atmel_tclib.c b/drivers/misc/atmel_tclib.c
new file mode 100644
index 00000000000..05dc8a31f28
--- /dev/null
+++ b/drivers/misc/atmel_tclib.c
@@ -0,0 +1,161 @@
+#include <linux/atmel_tc.h>
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+
+/* Number of bytes to reserve for the iomem resource */
+#define ATMEL_TC_IOMEM_SIZE	256
+
+
+/*
+ * This is a thin library to solve the problem of how to portably allocate
+ * one of the TC blocks.  For simplicity, it doesn't currently expect to
+ * share individual timers between different drivers.
+ */
+
+#if defined(CONFIG_AVR32)
+/* AVR32 has these divide PBB */
+const u8 atmel_tc_divisors[5] = { 0, 4, 8, 16, 32, };
+EXPORT_SYMBOL(atmel_tc_divisors);
+
+#elif defined(CONFIG_ARCH_AT91)
+/* AT91 has these divide MCK */
+const u8 atmel_tc_divisors[5] = { 2, 8, 32, 128, 0, };
+EXPORT_SYMBOL(atmel_tc_divisors);
+
+#endif
+
+static DEFINE_SPINLOCK(tc_list_lock);
+static LIST_HEAD(tc_list);
+
+/**
+ * atmel_tc_alloc - allocate a specified TC block
+ * @block: which block to allocate
+ * @name: name to be associated with the iomem resource
+ *
+ * Caller allocates a block.  If it is available, a pointer to a
+ * pre-initialized struct atmel_tc is returned. The caller can access
+ * the registers directly through the "regs" field.
+ */
+struct atmel_tc *atmel_tc_alloc(unsigned block, const char *name)
+{
+	struct atmel_tc		*tc;
+	struct platform_device	*pdev = NULL;
+	struct resource		*r;
+
+	spin_lock(&tc_list_lock);
+	list_for_each_entry(tc, &tc_list, node) {
+		if (tc->pdev->id == block) {
+			pdev = tc->pdev;
+			break;
+		}
+	}
+
+	if (!pdev || tc->iomem)
+		goto fail;
+
+	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	r = request_mem_region(r->start, ATMEL_TC_IOMEM_SIZE, name);
+	if (!r)
+		goto fail;
+
+	tc->regs = ioremap(r->start, ATMEL_TC_IOMEM_SIZE);
+	if (!tc->regs)
+		goto fail_ioremap;
+
+	tc->iomem = r;
+
+out:
+	spin_unlock(&tc_list_lock);
+	return tc;
+
+fail_ioremap:
+	release_resource(r);
+fail:
+	tc = NULL;
+	goto out;
+}
+EXPORT_SYMBOL_GPL(atmel_tc_alloc);
+
+/**
+ * atmel_tc_free - release a specified TC block
+ * @tc: Timer/counter block that was returned by atmel_tc_alloc()
+ *
+ * This reverses the effect of atmel_tc_alloc(), unmapping the I/O
+ * registers, invalidating the resource returned by that routine and
+ * making the TC available to other drivers.
+ */
+void atmel_tc_free(struct atmel_tc *tc)
+{
+	spin_lock(&tc_list_lock);
+	if (tc->regs) {
+		iounmap(tc->regs);
+		release_resource(tc->iomem);
+		tc->regs = NULL;
+		tc->iomem = NULL;
+	}
+	spin_unlock(&tc_list_lock);
+}
+EXPORT_SYMBOL_GPL(atmel_tc_free);
+
+static int __init tc_probe(struct platform_device *pdev)
+{
+	struct atmel_tc *tc;
+	struct clk	*clk;
+	int		irq;
+
+	if (!platform_get_resource(pdev, IORESOURCE_MEM, 0))
+		return -EINVAL;
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0)
+		return -EINVAL;
+
+	tc = kzalloc(sizeof(struct atmel_tc), GFP_KERNEL);
+	if (!tc)
+		return -ENOMEM;
+
+	tc->pdev = pdev;
+
+	clk = clk_get(&pdev->dev, "t0_clk");
+	if (IS_ERR(clk)) {
+		kfree(tc);
+		return -EINVAL;
+	}
+
+	tc->clk[0] = clk;
+	tc->clk[1] = clk_get(&pdev->dev, "t1_clk");
+	if (IS_ERR(tc->clk[1]))
+		tc->clk[1] = clk;
+	tc->clk[2] = clk_get(&pdev->dev, "t2_clk");
+	if (IS_ERR(tc->clk[2]))
+		tc->clk[2] = clk;
+
+	tc->irq[0] = irq;
+	tc->irq[1] = platform_get_irq(pdev, 1);
+	if (tc->irq[1] < 0)
+		tc->irq[1] = irq;
+	tc->irq[2] = platform_get_irq(pdev, 2);
+	if (tc->irq[2] < 0)
+		tc->irq[2] = irq;
+
+	spin_lock(&tc_list_lock);
+	list_add_tail(&tc->node, &tc_list);
+	spin_unlock(&tc_list_lock);
+
+	return 0;
+}
+
+static struct platform_driver tc_driver = {
+	.driver.name	= "atmel_tcb",
+};
+
+static int __init tc_init(void)
+{
+	return platform_driver_probe(&tc_driver, tc_probe);
+}
+arch_initcall(tc_init);
diff --git a/drivers/misc/eeepc-laptop.c b/drivers/misc/eeepc-laptop.c
new file mode 100644
index 00000000000..6d727609097
--- /dev/null
+++ b/drivers/misc/eeepc-laptop.c
@@ -0,0 +1,666 @@
+/*
+ *  eepc-laptop.c - Asus Eee PC extras
+ *
+ *  Based on asus_acpi.c as patched for the Eee PC by Asus:
+ *  ftp://ftp.asus.com/pub/ASUS/EeePC/701/ASUS_ACPI_071126.rar
+ *  Based on eee.c from eeepc-linux
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/platform_device.h>
+#include <linux/backlight.h>
+#include <linux/fb.h>
+#include <linux/hwmon.h>
+#include <linux/hwmon-sysfs.h>
+#include <acpi/acpi_drivers.h>
+#include <acpi/acpi_bus.h>
+#include <linux/uaccess.h>
+
+#define EEEPC_LAPTOP_VERSION	"0.1"
+
+#define EEEPC_HOTK_NAME		"Eee PC Hotkey Driver"
+#define EEEPC_HOTK_FILE		"eeepc"
+#define EEEPC_HOTK_CLASS	"hotkey"
+#define EEEPC_HOTK_DEVICE_NAME	"Hotkey"
+#define EEEPC_HOTK_HID		"ASUS010"
+
+#define EEEPC_LOG	EEEPC_HOTK_FILE ": "
+#define EEEPC_ERR	KERN_ERR	EEEPC_LOG
+#define EEEPC_WARNING	KERN_WARNING	EEEPC_LOG
+#define EEEPC_NOTICE	KERN_NOTICE	EEEPC_LOG
+#define EEEPC_INFO	KERN_INFO	EEEPC_LOG
+
+/*
+ * Definitions for Asus EeePC
+ */
+#define	NOTIFY_WLAN_ON	0x10
+#define NOTIFY_BRN_MIN	0x20
+#define NOTIFY_BRN_MAX	0x2f
+
+enum {
+	DISABLE_ASL_WLAN = 0x0001,
+	DISABLE_ASL_BLUETOOTH = 0x0002,
+	DISABLE_ASL_IRDA = 0x0004,
+	DISABLE_ASL_CAMERA = 0x0008,
+	DISABLE_ASL_TV = 0x0010,
+	DISABLE_ASL_GPS = 0x0020,
+	DISABLE_ASL_DISPLAYSWITCH = 0x0040,
+	DISABLE_ASL_MODEM = 0x0080,
+	DISABLE_ASL_CARDREADER = 0x0100
+};
+
+enum {
+	CM_ASL_WLAN = 0,
+	CM_ASL_BLUETOOTH,
+	CM_ASL_IRDA,
+	CM_ASL_1394,
+	CM_ASL_CAMERA,
+	CM_ASL_TV,
+	CM_ASL_GPS,
+	CM_ASL_DVDROM,
+	CM_ASL_DISPLAYSWITCH,
+	CM_ASL_PANELBRIGHT,
+	CM_ASL_BIOSFLASH,
+	CM_ASL_ACPIFLASH,
+	CM_ASL_CPUFV,
+	CM_ASL_CPUTEMPERATURE,
+	CM_ASL_FANCPU,
+	CM_ASL_FANCHASSIS,
+	CM_ASL_USBPORT1,
+	CM_ASL_USBPORT2,
+	CM_ASL_USBPORT3,
+	CM_ASL_MODEM,
+	CM_ASL_CARDREADER,
+	CM_ASL_LID
+};
+
+const char *cm_getv[] = {
+	"WLDG", NULL, NULL, NULL,
+	"CAMG", NULL, NULL, NULL,
+	NULL, "PBLG", NULL, NULL,
+	"CFVG", NULL, NULL, NULL,
+	"USBG", NULL, NULL, "MODG",
+	"CRDG", "LIDG"
+};
+
+const char *cm_setv[] = {
+	"WLDS", NULL, NULL, NULL,
+	"CAMS", NULL, NULL, NULL,
+	"SDSP", "PBLS", "HDPS", NULL,
+	"CFVS", NULL, NULL, NULL,
+	"USBG", NULL, NULL, "MODS",
+	"CRDS", NULL
+};
+
+#define EEEPC_EC	"\\_SB.PCI0.SBRG.EC0."
+
+#define EEEPC_EC_FAN_PWM	EEEPC_EC "SC02" /* Fan PWM duty cycle (%) */
+#define EEEPC_EC_SC02		0x63
+#define EEEPC_EC_FAN_HRPM	EEEPC_EC "SC05" /* High byte, fan speed (RPM) */
+#define EEEPC_EC_FAN_LRPM	EEEPC_EC "SC06" /* Low byte, fan speed (RPM) */
+#define EEEPC_EC_FAN_CTRL	EEEPC_EC "SFB3" /* Byte containing SF25  */
+#define EEEPC_EC_SFB3		0xD3
+
+/*
+ * This is the main structure, we can use it to store useful information
+ * about the hotk device
+ */
+struct eeepc_hotk {
+	struct acpi_device *device;	/* the device we are in */
+	acpi_handle handle;		/* the handle of the hotk device */
+	u32 cm_supported;		/* the control methods supported
+					   by this BIOS */
+	uint init_flag;			/* Init flags */
+	u16 event_count[128];		/* count for each event */
+};
+
+/* The actual device the driver binds to */
+static struct eeepc_hotk *ehotk;
+
+/* Platform device/driver */
+static struct platform_driver platform_driver = {
+	.driver = {
+		.name = EEEPC_HOTK_FILE,
+		.owner = THIS_MODULE,
+	}
+};
+
+static struct platform_device *platform_device;
+
+/*
+ * The hotkey driver declaration
+ */
+static int eeepc_hotk_add(struct acpi_device *device);
+static int eeepc_hotk_remove(struct acpi_device *device, int type);
+
+static const struct acpi_device_id eeepc_device_ids[] = {
+	{EEEPC_HOTK_HID, 0},
+	{"", 0},
+};
+MODULE_DEVICE_TABLE(acpi, eeepc_device_ids);
+
+static struct acpi_driver eeepc_hotk_driver = {
+	.name = EEEPC_HOTK_NAME,
+	.class = EEEPC_HOTK_CLASS,
+	.ids = eeepc_device_ids,
+	.ops = {
+		.add = eeepc_hotk_add,
+		.remove = eeepc_hotk_remove,
+	},
+};
+
+/* The backlight device /sys/class/backlight */
+static struct backlight_device *eeepc_backlight_device;
+
+/* The hwmon device */
+static struct device *eeepc_hwmon_device;
+
+/*
+ * The backlight class declaration
+ */
+static int read_brightness(struct backlight_device *bd);
+static int update_bl_status(struct backlight_device *bd);
+static struct backlight_ops eeepcbl_ops = {
+	.get_brightness = read_brightness,
+	.update_status = update_bl_status,
+};
+
+MODULE_AUTHOR("Corentin Chary, Eric Cooper");
+MODULE_DESCRIPTION(EEEPC_HOTK_NAME);
+MODULE_LICENSE("GPL");
+
+/*
+ * ACPI Helpers
+ */
+static int write_acpi_int(acpi_handle handle, const char *method, int val,
+			  struct acpi_buffer *output)
+{
+	struct acpi_object_list params;
+	union acpi_object in_obj;
+	acpi_status status;
+
+	params.count = 1;
+	params.pointer = &in_obj;
+	in_obj.type = ACPI_TYPE_INTEGER;
+	in_obj.integer.value = val;
+
+	status = acpi_evaluate_object(handle, (char *)method, &params, output);
+	return (status == AE_OK ? 0 : -1);
+}
+
+static int read_acpi_int(acpi_handle handle, const char *method, int *val)
+{
+	acpi_status status;
+	ulong result;
+
+	status = acpi_evaluate_integer(handle, (char *)method, NULL, &result);
+	if (ACPI_FAILURE(status)) {
+		*val = -1;
+		return -1;
+	} else {
+		*val = result;
+		return 0;
+	}
+}
+
+static int set_acpi(int cm, int value)
+{
+	if (ehotk->cm_supported & (0x1 << cm)) {
+		const char *method = cm_setv[cm];
+		if (method == NULL)
+			return -ENODEV;
+		if (write_acpi_int(ehotk->handle, method, value, NULL))
+			printk(EEEPC_WARNING "Error writing %s\n", method);
+	}
+	return 0;
+}
+
+static int get_acpi(int cm)
+{
+	int value = -1;
+	if ((ehotk->cm_supported & (0x1 << cm))) {
+		const char *method = cm_getv[cm];
+		if (method == NULL)
+			return -ENODEV;
+		if (read_acpi_int(ehotk->handle, method, &value))
+			printk(EEEPC_WARNING "Error reading %s\n", method);
+	}
+	return value;
+}
+
+/*
+ * Backlight
+ */
+static int read_brightness(struct backlight_device *bd)
+{
+	return get_acpi(CM_ASL_PANELBRIGHT);
+}
+
+static int set_brightness(struct backlight_device *bd, int value)
+{
+	value = max(0, min(15, value));
+	return set_acpi(CM_ASL_PANELBRIGHT, value);
+}
+
+static int update_bl_status(struct backlight_device *bd)
+{
+	return set_brightness(bd, bd->props.brightness);
+}
+
+/*
+ * Sys helpers
+ */
+static int parse_arg(const char *buf, unsigned long count, int *val)
+{
+	if (!count)
+		return 0;
+	if (sscanf(buf, "%i", val) != 1)
+		return -EINVAL;
+	return count;
+}
+
+static ssize_t store_sys_acpi(int cm, const char *buf, size_t count)
+{
+	int rv, value;
+
+	rv = parse_arg(buf, count, &value);
+	if (rv > 0)
+		set_acpi(cm, value);
+	return rv;
+}
+
+static ssize_t show_sys_acpi(int cm, char *buf)
+{
+	return sprintf(buf, "%d\n", get_acpi(cm));
+}
+
+#define EEEPC_CREATE_DEVICE_ATTR(_name, _cm)				\
+	static ssize_t show_##_name(struct device *dev,			\
+				    struct device_attribute *attr,	\
+				    char *buf)				\
+	{								\
+		return show_sys_acpi(_cm, buf);				\
+	}								\
+	static ssize_t store_##_name(struct device *dev,		\
+				     struct device_attribute *attr,	\
+				     const char *buf, size_t count)	\
+	{								\
+		return store_sys_acpi(_cm, buf, count);			\
+	}								\
+	static struct device_attribute dev_attr_##_name = {		\
+		.attr = {						\
+			.name = __stringify(_name),			\
+			.mode = 0644 },					\
+		.show   = show_##_name,					\
+		.store  = store_##_name,				\
+	}
+
+EEEPC_CREATE_DEVICE_ATTR(camera, CM_ASL_CAMERA);
+EEEPC_CREATE_DEVICE_ATTR(cardr, CM_ASL_CARDREADER);
+EEEPC_CREATE_DEVICE_ATTR(disp, CM_ASL_DISPLAYSWITCH);
+EEEPC_CREATE_DEVICE_ATTR(wlan, CM_ASL_WLAN);
+
+static struct attribute *platform_attributes[] = {
+	&dev_attr_camera.attr,
+	&dev_attr_cardr.attr,
+	&dev_attr_disp.attr,
+	&dev_attr_wlan.attr,
+	NULL
+};
+
+static struct attribute_group platform_attribute_group = {
+	.attrs = platform_attributes
+};
+
+/*
+ * Hotkey functions
+ */
+static int eeepc_hotk_check(void)
+{
+	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+	int result;
+
+	result = acpi_bus_get_status(ehotk->device);
+	if (result)
+		return result;
+	if (ehotk->device->status.present) {
+		if (write_acpi_int(ehotk->handle, "INIT", ehotk->init_flag,
+				    &buffer)) {
+			printk(EEEPC_ERR "Hotkey initialization failed\n");
+			return -ENODEV;
+		} else {
+			printk(EEEPC_NOTICE "Hotkey init flags 0x%x\n",
+			       ehotk->init_flag);
+		}
+		/* get control methods supported */
+		if (read_acpi_int(ehotk->handle, "CMSG"
+				   , &ehotk->cm_supported)) {
+			printk(EEEPC_ERR
+			       "Get control methods supported failed\n");
+			return -ENODEV;
+		} else {
+			printk(EEEPC_INFO
+			       "Get control methods supported: 0x%x\n",
+			       ehotk->cm_supported);
+		}
+	} else {
+		printk(EEEPC_ERR "Hotkey device not present, aborting\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static void notify_wlan(u32 *event)
+{
+	/* if DISABLE_ASL_WLAN is set, the notify code for fn+f2
+	   will always be 0x10 */
+	if (ehotk->cm_supported & (0x1 << CM_ASL_WLAN)) {
+		const char *method = cm_getv[CM_ASL_WLAN];
+		int value;
+		if (read_acpi_int(ehotk->handle, method, &value))
+			printk(EEEPC_WARNING "Error reading %s\n",
+			       method);
+		else if (value == 1)
+			*event = 0x11;
+	}
+}
+
+static void notify_brn(void)
+{
+	struct backlight_device *bd = eeepc_backlight_device;
+	bd->props.brightness = read_brightness(bd);
+}
+
+static void eeepc_hotk_notify(acpi_handle handle, u32 event, void *data)
+{
+	if (!ehotk)
+		return;
+	if (event == NOTIFY_WLAN_ON && (DISABLE_ASL_WLAN & ehotk->init_flag))
+		notify_wlan(&event);
+	if (event >= NOTIFY_BRN_MIN && event <= NOTIFY_BRN_MAX)
+		notify_brn();
+	acpi_bus_generate_proc_event(ehotk->device, event,
+				     ehotk->event_count[event % 128]++);
+}
+
+static int eeepc_hotk_add(struct acpi_device *device)
+{
+	acpi_status status = AE_OK;
+	int result;
+
+	if (!device)
+		 return -EINVAL;
+	printk(EEEPC_NOTICE EEEPC_HOTK_NAME "\n");
+	ehotk = kzalloc(sizeof(struct eeepc_hotk), GFP_KERNEL);
+	if (!ehotk)
+		return -ENOMEM;
+	ehotk->init_flag = DISABLE_ASL_WLAN | DISABLE_ASL_DISPLAYSWITCH;
+	ehotk->handle = device->handle;
+	strcpy(acpi_device_name(device), EEEPC_HOTK_DEVICE_NAME);
+	strcpy(acpi_device_class(device), EEEPC_HOTK_CLASS);
+	acpi_driver_data(device) = ehotk;
+	ehotk->device = device;
+	result = eeepc_hotk_check();
+	if (result)
+		goto end;
+	status = acpi_install_notify_handler(ehotk->handle, ACPI_SYSTEM_NOTIFY,
+					     eeepc_hotk_notify, ehotk);
+	if (ACPI_FAILURE(status))
+		printk(EEEPC_ERR "Error installing notify handler\n");
+ end:
+	if (result) {
+		kfree(ehotk);
+		ehotk = NULL;
+	}
+	return result;
+}
+
+static int eeepc_hotk_remove(struct acpi_device *device, int type)
+{
+	acpi_status status = 0;
+
+	if (!device || !acpi_driver_data(device))
+		 return -EINVAL;
+	status = acpi_remove_notify_handler(ehotk->handle, ACPI_SYSTEM_NOTIFY,
+					    eeepc_hotk_notify);
+	if (ACPI_FAILURE(status))
+		printk(EEEPC_ERR "Error removing notify handler\n");
+	kfree(ehotk);
+	return 0;
+}
+
+/*
+ * Hwmon
+ */
+static int eeepc_get_fan_pwm(void)
+{
+	int value = 0;
+
+	read_acpi_int(NULL, EEEPC_EC_FAN_PWM, &value);
+	return (value);
+}
+
+static void eeepc_set_fan_pwm(int value)
+{
+	value = SENSORS_LIMIT(value, 0, 100);
+	ec_write(EEEPC_EC_SC02, value);
+}
+
+static int eeepc_get_fan_rpm(void)
+{
+	int high = 0;
+	int low = 0;
+
+	read_acpi_int(NULL, EEEPC_EC_FAN_HRPM, &high);
+	read_acpi_int(NULL, EEEPC_EC_FAN_LRPM, &low);
+	return (high << 8 | low);
+}
+
+static int eeepc_get_fan_ctrl(void)
+{
+	int value = 0;
+
+	read_acpi_int(NULL, EEEPC_EC_FAN_CTRL, &value);
+	return ((value & 0x02 ? 1 : 0));
+}
+
+static void eeepc_set_fan_ctrl(int manual)
+{
+	int value = 0;
+
+	read_acpi_int(NULL, EEEPC_EC_FAN_CTRL, &value);
+	if (manual)
+		value |= 0x02;
+	else
+		value &= ~0x02;
+	ec_write(EEEPC_EC_SFB3, value);
+}
+
+static ssize_t store_sys_hwmon(void (*set)(int), const char *buf, size_t count)
+{
+	int rv, value;
+
+	rv = parse_arg(buf, count, &value);
+	if (rv > 0)
+		set(value);
+	return rv;
+}
+
+static ssize_t show_sys_hwmon(int (*get)(void), char *buf)
+{
+	return sprintf(buf, "%d\n", get());
+}
+
+#define EEEPC_CREATE_SENSOR_ATTR(_name, _mode, _set, _get)		\
+	static ssize_t show_##_name(struct device *dev,			\
+				    struct device_attribute *attr,	\
+				    char *buf)				\
+	{								\
+		return show_sys_hwmon(_set, buf);			\
+	}								\
+	static ssize_t store_##_name(struct device *dev,		\
+				     struct device_attribute *attr,	\
+				     const char *buf, size_t count)	\
+	{								\
+		return store_sys_hwmon(_get, buf, count);		\
+	}								\
+	static SENSOR_DEVICE_ATTR(_name, _mode, show_##_name, store_##_name, 0);
+
+EEEPC_CREATE_SENSOR_ATTR(fan1_input, S_IRUGO, eeepc_get_fan_rpm, NULL);
+EEEPC_CREATE_SENSOR_ATTR(fan1_pwm, S_IRUGO | S_IWUSR,
+			 eeepc_get_fan_pwm, eeepc_set_fan_pwm);
+EEEPC_CREATE_SENSOR_ATTR(pwm1_enable, S_IRUGO | S_IWUSR,
+			 eeepc_get_fan_ctrl, eeepc_set_fan_ctrl);
+
+static struct attribute *hwmon_attributes[] = {
+	&sensor_dev_attr_fan1_pwm.dev_attr.attr,
+	&sensor_dev_attr_fan1_input.dev_attr.attr,
+	&sensor_dev_attr_pwm1_enable.dev_attr.attr,
+	NULL
+};
+
+static struct attribute_group hwmon_attribute_group = {
+	.attrs = hwmon_attributes
+};
+
+/*
+ * exit/init
+ */
+static void eeepc_backlight_exit(void)
+{
+	if (eeepc_backlight_device)
+		backlight_device_unregister(eeepc_backlight_device);
+	eeepc_backlight_device = NULL;
+}
+
+static void eeepc_hwmon_exit(void)
+{
+	struct device *hwmon;
+
+	hwmon = eeepc_hwmon_device;
+	if (!hwmon)
+		return ;
+	hwmon_device_unregister(hwmon);
+	sysfs_remove_group(&hwmon->kobj,
+			   &hwmon_attribute_group);
+	eeepc_hwmon_device = NULL;
+}
+
+static void __exit eeepc_laptop_exit(void)
+{
+	eeepc_backlight_exit();
+	eeepc_hwmon_exit();
+	acpi_bus_unregister_driver(&eeepc_hotk_driver);
+	sysfs_remove_group(&platform_device->dev.kobj,
+			   &platform_attribute_group);
+	platform_device_unregister(platform_device);
+	platform_driver_unregister(&platform_driver);
+}
+
+static int eeepc_backlight_init(struct device *dev)
+{
+	struct backlight_device *bd;
+
+	bd = backlight_device_register(EEEPC_HOTK_FILE, dev,
+				       NULL, &eeepcbl_ops);
+	if (IS_ERR(bd)) {
+		printk(EEEPC_ERR
+		       "Could not register eeepc backlight device\n");
+		eeepc_backlight_device = NULL;
+		return PTR_ERR(bd);
+	}
+	eeepc_backlight_device = bd;
+	bd->props.max_brightness = 15;
+	bd->props.brightness = read_brightness(NULL);
+	bd->props.power = FB_BLANK_UNBLANK;
+	backlight_update_status(bd);
+	return 0;
+}
+
+static int eeepc_hwmon_init(struct device *dev)
+{
+	struct device *hwmon;
+	int result;
+
+	hwmon = hwmon_device_register(dev);
+	if (IS_ERR(hwmon)) {
+		printk(EEEPC_ERR
+		       "Could not register eeepc hwmon device\n");
+		eeepc_hwmon_device = NULL;
+		return PTR_ERR(hwmon);
+	}
+	eeepc_hwmon_device = hwmon;
+	result = sysfs_create_group(&hwmon->kobj,
+				    &hwmon_attribute_group);
+	if (result)
+		eeepc_hwmon_exit();
+	return result;
+}
+
+static int __init eeepc_laptop_init(void)
+{
+	struct device *dev;
+	int result;
+
+	if (acpi_disabled)
+		return -ENODEV;
+	result = acpi_bus_register_driver(&eeepc_hotk_driver);
+	if (result < 0)
+		return result;
+	if (!ehotk) {
+		acpi_bus_unregister_driver(&eeepc_hotk_driver);
+		return -ENODEV;
+	}
+	dev = acpi_get_physical_device(ehotk->device->handle);
+	result = eeepc_backlight_init(dev);
+	if (result)
+		goto fail_backlight;
+	result = eeepc_hwmon_init(dev);
+	if (result)
+		goto fail_hwmon;
+	/* Register platform stuff */
+	result = platform_driver_register(&platform_driver);
+	if (result)
+		goto fail_platform_driver;
+	platform_device = platform_device_alloc(EEEPC_HOTK_FILE, -1);
+	if (!platform_device) {
+		result = -ENOMEM;
+		goto fail_platform_device1;
+	}
+	result = platform_device_add(platform_device);
+	if (result)
+		goto fail_platform_device2;
+	result = sysfs_create_group(&platform_device->dev.kobj,
+				    &platform_attribute_group);
+	if (result)
+		goto fail_sysfs;
+	return 0;
+fail_sysfs:
+	platform_device_del(platform_device);
+fail_platform_device2:
+	platform_device_put(platform_device);
+fail_platform_device1:
+	platform_driver_unregister(&platform_driver);
+fail_platform_driver:
+	eeepc_hwmon_exit();
+fail_hwmon:
+	eeepc_backlight_exit();
+fail_backlight:
+	return result;
+}
+
+module_init(eeepc_laptop_init);
+module_exit(eeepc_laptop_exit);
diff --git a/drivers/misc/enclosure.c b/drivers/misc/enclosure.c
index 6fcb0e96adf..0736cff9d97 100644
--- a/drivers/misc/enclosure.c
+++ b/drivers/misc/enclosure.c
@@ -31,7 +31,6 @@
 static LIST_HEAD(container_list);
 static DEFINE_MUTEX(container_list_lock);
 static struct class enclosure_class;
-static struct class enclosure_component_class;
 
 /**
  * enclosure_find - find an enclosure given a device
@@ -40,16 +39,16 @@ static struct class enclosure_component_class;
  * Looks through the list of registered enclosures to see
  * if it can find a match for a device.  Returns NULL if no
  * enclosure is found. Obtains a reference to the enclosure class
- * device which must be released with class_device_put().
+ * device which must be released with device_put().
  */
 struct enclosure_device *enclosure_find(struct device *dev)
 {
-	struct enclosure_device *edev = NULL;
+	struct enclosure_device *edev;
 
 	mutex_lock(&container_list_lock);
 	list_for_each_entry(edev, &container_list, node) {
-		if (edev->cdev.dev == dev) {
-			class_device_get(&edev->cdev);
+		if (edev->edev.parent == dev) {
+			get_device(&edev->edev);
 			mutex_unlock(&container_list_lock);
 			return edev;
 		}
@@ -117,11 +116,11 @@ enclosure_register(struct device *dev, const char *name, int components,
 
 	edev->components = components;
 
-	edev->cdev.class = &enclosure_class;
-	edev->cdev.dev = get_device(dev);
+	edev->edev.class = &enclosure_class;
+	edev->edev.parent = get_device(dev);
 	edev->cb = cb;
-	snprintf(edev->cdev.class_id, BUS_ID_SIZE, "%s", name);
-	err = class_device_register(&edev->cdev);
+	snprintf(edev->edev.bus_id, BUS_ID_SIZE, "%s", name);
+	err = device_register(&edev->edev);
 	if (err)
 		goto err;
 
@@ -135,7 +134,7 @@ enclosure_register(struct device *dev, const char *name, int components,
 	return edev;
 
  err:
-	put_device(edev->cdev.dev);
+	put_device(edev->edev.parent);
 	kfree(edev);
 	return ERR_PTR(err);
 }
@@ -158,29 +157,69 @@ void enclosure_unregister(struct enclosure_device *edev)
 
 	for (i = 0; i < edev->components; i++)
 		if (edev->component[i].number != -1)
-			class_device_unregister(&edev->component[i].cdev);
+			device_unregister(&edev->component[i].cdev);
 
 	/* prevent any callbacks into service user */
 	edev->cb = &enclosure_null_callbacks;
-	class_device_unregister(&edev->cdev);
+	device_unregister(&edev->edev);
 }
 EXPORT_SYMBOL_GPL(enclosure_unregister);
 
-static void enclosure_release(struct class_device *cdev)
+#define ENCLOSURE_NAME_SIZE	64
+
+static void enclosure_link_name(struct enclosure_component *cdev, char *name)
+{
+	strcpy(name, "enclosure_device:");
+	strcat(name, cdev->cdev.bus_id);
+}
+
+static void enclosure_remove_links(struct enclosure_component *cdev)
+{
+	char name[ENCLOSURE_NAME_SIZE];
+
+	enclosure_link_name(cdev, name);
+	sysfs_remove_link(&cdev->dev->kobj, name);
+	sysfs_remove_link(&cdev->cdev.kobj, "device");
+}
+
+static int enclosure_add_links(struct enclosure_component *cdev)
+{
+	int error;
+	char name[ENCLOSURE_NAME_SIZE];
+
+	error = sysfs_create_link(&cdev->cdev.kobj, &cdev->dev->kobj, "device");
+	if (error)
+		return error;
+
+	enclosure_link_name(cdev, name);
+	error = sysfs_create_link(&cdev->dev->kobj, &cdev->cdev.kobj, name);
+	if (error)
+		sysfs_remove_link(&cdev->cdev.kobj, "device");
+
+	return error;
+}
+
+static void enclosure_release(struct device *cdev)
 {
 	struct enclosure_device *edev = to_enclosure_device(cdev);
 
-	put_device(cdev->dev);
+	put_device(cdev->parent);
 	kfree(edev);
 }
 
-static void enclosure_component_release(struct class_device *cdev)
+static void enclosure_component_release(struct device *dev)
 {
-	if (cdev->dev)
+	struct enclosure_component *cdev = to_enclosure_component(dev);
+
+	if (cdev->dev) {
+		enclosure_remove_links(cdev);
 		put_device(cdev->dev);
-	class_device_put(cdev->parent);
+	}
+	put_device(dev->parent);
 }
 
+static struct attribute_group *enclosure_groups[];
+
 /**
  * enclosure_component_register - add a particular component to an enclosure
  * @edev:	the enclosure to add the component
@@ -201,7 +240,7 @@ enclosure_component_register(struct enclosure_device *edev,
 			     const char *name)
 {
 	struct enclosure_component *ecomp;
-	struct class_device *cdev;
+	struct device *cdev;
 	int err;
 
 	if (number >= edev->components)
@@ -215,14 +254,16 @@ enclosure_component_register(struct enclosure_device *edev,
 	ecomp->type = type;
 	ecomp->number = number;
 	cdev = &ecomp->cdev;
-	cdev->parent = class_device_get(&edev->cdev);
-	cdev->class = &enclosure_component_class;
+	cdev->parent = get_device(&edev->edev);
 	if (name)
-		snprintf(cdev->class_id, BUS_ID_SIZE, "%s", name);
+		snprintf(cdev->bus_id, BUS_ID_SIZE, "%s", name);
 	else
-		snprintf(cdev->class_id, BUS_ID_SIZE, "%u", number);
+		snprintf(cdev->bus_id, BUS_ID_SIZE, "%u", number);
 
-	err = class_device_register(cdev);
+	cdev->release = enclosure_component_release;
+	cdev->groups = enclosure_groups;
+
+	err = device_register(cdev);
 	if (err)
 		ERR_PTR(err);
 
@@ -247,18 +288,19 @@ EXPORT_SYMBOL_GPL(enclosure_component_register);
 int enclosure_add_device(struct enclosure_device *edev, int component,
 			 struct device *dev)
 {
-	struct class_device *cdev;
+	struct enclosure_component *cdev;
 
 	if (!edev || component >= edev->components)
 		return -EINVAL;
 
-	cdev = &edev->component[component].cdev;
+	cdev = &edev->component[component];
 
-	class_device_del(cdev);
 	if (cdev->dev)
-		put_device(cdev->dev);
+		enclosure_remove_links(cdev);
+
+	put_device(cdev->dev);
 	cdev->dev = get_device(dev);
-	return class_device_add(cdev);
+	return enclosure_add_links(cdev);
 }
 EXPORT_SYMBOL_GPL(enclosure_add_device);
 
@@ -272,18 +314,17 @@ EXPORT_SYMBOL_GPL(enclosure_add_device);
  */
 int enclosure_remove_device(struct enclosure_device *edev, int component)
 {
-	struct class_device *cdev;
+	struct enclosure_component *cdev;
 
 	if (!edev || component >= edev->components)
 		return -EINVAL;
 
-	cdev = &edev->component[component].cdev;
+	cdev = &edev->component[component];
 
-	class_device_del(cdev);
-	if (cdev->dev)
-		put_device(cdev->dev);
+	device_del(&cdev->cdev);
+	put_device(cdev->dev);
 	cdev->dev = NULL;
-	return class_device_add(cdev);
+	return device_add(&cdev->cdev);
 }
 EXPORT_SYMBOL_GPL(enclosure_remove_device);
 
@@ -291,14 +332,16 @@ EXPORT_SYMBOL_GPL(enclosure_remove_device);
  * sysfs pieces below
  */
 
-static ssize_t enclosure_show_components(struct class_device *cdev, char *buf)
+static ssize_t enclosure_show_components(struct device *cdev,
+					 struct device_attribute *attr,
+					 char *buf)
 {
 	struct enclosure_device *edev = to_enclosure_device(cdev);
 
 	return snprintf(buf, 40, "%d\n", edev->components);
 }
 
-static struct class_device_attribute enclosure_attrs[] = {
+static struct device_attribute enclosure_attrs[] = {
 	__ATTR(components, S_IRUGO, enclosure_show_components, NULL),
 	__ATTR_NULL
 };
@@ -306,8 +349,8 @@ static struct class_device_attribute enclosure_attrs[] = {
 static struct class enclosure_class = {
 	.name			= "enclosure",
 	.owner			= THIS_MODULE,
-	.release		= enclosure_release,
-	.class_dev_attrs	= enclosure_attrs,
+	.dev_release		= enclosure_release,
+	.dev_attrs		= enclosure_attrs,
 };
 
 static const char *const enclosure_status [] = {
@@ -326,7 +369,8 @@ static const char *const enclosure_type [] = {
 	[ENCLOSURE_COMPONENT_ARRAY_DEVICE] = "array device",
 };
 
-static ssize_t get_component_fault(struct class_device *cdev, char *buf)
+static ssize_t get_component_fault(struct device *cdev,
+				   struct device_attribute *attr, char *buf)
 {
 	struct enclosure_device *edev = to_enclosure_device(cdev->parent);
 	struct enclosure_component *ecomp = to_enclosure_component(cdev);
@@ -336,8 +380,9 @@ static ssize_t get_component_fault(struct class_device *cdev, char *buf)
 	return snprintf(buf, 40, "%d\n", ecomp->fault);
 }
 
-static ssize_t set_component_fault(struct class_device *cdev, const char *buf,
-				   size_t count)
+static ssize_t set_component_fault(struct device *cdev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t count)
 {
 	struct enclosure_device *edev = to_enclosure_device(cdev->parent);
 	struct enclosure_component *ecomp = to_enclosure_component(cdev);
@@ -348,7 +393,8 @@ static ssize_t set_component_fault(struct class_device *cdev, const char *buf,
 	return count;
 }
 
-static ssize_t get_component_status(struct class_device *cdev, char *buf)
+static ssize_t get_component_status(struct device *cdev,
+				    struct device_attribute *attr,char *buf)
 {
 	struct enclosure_device *edev = to_enclosure_device(cdev->parent);
 	struct enclosure_component *ecomp = to_enclosure_component(cdev);
@@ -358,8 +404,9 @@ static ssize_t get_component_status(struct class_device *cdev, char *buf)
 	return snprintf(buf, 40, "%s\n", enclosure_status[ecomp->status]);
 }
 
-static ssize_t set_component_status(struct class_device *cdev, const char *buf,
-				   size_t count)
+static ssize_t set_component_status(struct device *cdev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t count)
 {
 	struct enclosure_device *edev = to_enclosure_device(cdev->parent);
 	struct enclosure_component *ecomp = to_enclosure_component(cdev);
@@ -380,7 +427,8 @@ static ssize_t set_component_status(struct class_device *cdev, const char *buf,
 		return -EINVAL;
 }
 
-static ssize_t get_component_active(struct class_device *cdev, char *buf)
+static ssize_t get_component_active(struct device *cdev,
+				    struct device_attribute *attr, char *buf)
 {
 	struct enclosure_device *edev = to_enclosure_device(cdev->parent);
 	struct enclosure_component *ecomp = to_enclosure_component(cdev);
@@ -390,8 +438,9 @@ static ssize_t get_component_active(struct class_device *cdev, char *buf)
 	return snprintf(buf, 40, "%d\n", ecomp->active);
 }
 
-static ssize_t set_component_active(struct class_device *cdev, const char *buf,
-				   size_t count)
+static ssize_t set_component_active(struct device *cdev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t count)
 {
 	struct enclosure_device *edev = to_enclosure_device(cdev->parent);
 	struct enclosure_component *ecomp = to_enclosure_component(cdev);
@@ -402,7 +451,8 @@ static ssize_t set_component_active(struct class_device *cdev, const char *buf,
 	return count;
 }
 
-static ssize_t get_component_locate(struct class_device *cdev, char *buf)
+static ssize_t get_component_locate(struct device *cdev,
+				    struct device_attribute *attr, char *buf)
 {
 	struct enclosure_device *edev = to_enclosure_device(cdev->parent);
 	struct enclosure_component *ecomp = to_enclosure_component(cdev);
@@ -412,8 +462,9 @@ static ssize_t get_component_locate(struct class_device *cdev, char *buf)
 	return snprintf(buf, 40, "%d\n", ecomp->locate);
 }
 
-static ssize_t set_component_locate(struct class_device *cdev, const char *buf,
-				   size_t count)
+static ssize_t set_component_locate(struct device *cdev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t count)
 {
 	struct enclosure_device *edev = to_enclosure_device(cdev->parent);
 	struct enclosure_component *ecomp = to_enclosure_component(cdev);
@@ -424,7 +475,8 @@ static ssize_t set_component_locate(struct class_device *cdev, const char *buf,
 	return count;
 }
 
-static ssize_t get_component_type(struct class_device *cdev, char *buf)
+static ssize_t get_component_type(struct device *cdev,
+				  struct device_attribute *attr, char *buf)
 {
 	struct enclosure_component *ecomp = to_enclosure_component(cdev);
 
@@ -432,24 +484,32 @@ static ssize_t get_component_type(struct class_device *cdev, char *buf)
 }
 
 
-static struct class_device_attribute enclosure_component_attrs[] = {
-	__ATTR(fault, S_IRUGO | S_IWUSR, get_component_fault,
-	       set_component_fault),
-	__ATTR(status, S_IRUGO | S_IWUSR, get_component_status,
-	       set_component_status),
-	__ATTR(active, S_IRUGO | S_IWUSR, get_component_active,
-	       set_component_active),
-	__ATTR(locate, S_IRUGO | S_IWUSR, get_component_locate,
-	       set_component_locate),
-	__ATTR(type, S_IRUGO, get_component_type, NULL),
-	__ATTR_NULL
+static DEVICE_ATTR(fault, S_IRUGO | S_IWUSR, get_component_fault,
+		    set_component_fault);
+static DEVICE_ATTR(status, S_IRUGO | S_IWUSR, get_component_status,
+		   set_component_status);
+static DEVICE_ATTR(active, S_IRUGO | S_IWUSR, get_component_active,
+		   set_component_active);
+static DEVICE_ATTR(locate, S_IRUGO | S_IWUSR, get_component_locate,
+		   set_component_locate);
+static DEVICE_ATTR(type, S_IRUGO, get_component_type, NULL);
+
+static struct attribute *enclosure_component_attrs[] = {
+	&dev_attr_fault.attr,
+	&dev_attr_status.attr,
+	&dev_attr_active.attr,
+	&dev_attr_locate.attr,
+	&dev_attr_type.attr,
+	NULL
 };
 
-static struct class enclosure_component_class =  {
-	.name			= "enclosure_component",
-	.owner			= THIS_MODULE,
-	.class_dev_attrs	= enclosure_component_attrs,
-	.release		= enclosure_component_release,
+static struct attribute_group enclosure_group = {
+	.attrs = enclosure_component_attrs,
+};
+
+static struct attribute_group *enclosure_groups[] = {
+	&enclosure_group,
+	NULL
 };
 
 static int __init enclosure_init(void)
@@ -459,20 +519,12 @@ static int __init enclosure_init(void)
 	err = class_register(&enclosure_class);
 	if (err)
 		return err;
-	err = class_register(&enclosure_component_class);
-	if (err)
-		goto err_out;
 
 	return 0;
- err_out:
-	class_unregister(&enclosure_class);
-
-	return err;
 }
 
 static void __exit enclosure_exit(void)
 {
-	class_unregister(&enclosure_component_class);
 	class_unregister(&enclosure_class);
 }
 
diff --git a/drivers/misc/intel_menlow.c b/drivers/misc/intel_menlow.c
index de16e88eb8d..5bb8816c912 100644
--- a/drivers/misc/intel_menlow.c
+++ b/drivers/misc/intel_menlow.c
@@ -175,28 +175,18 @@ static int intel_menlow_memory_add(struct acpi_device *device)
 		goto end;
 	}
 
-	if (cdev) {
-		acpi_driver_data(device) = cdev;
-		result = sysfs_create_link(&device->dev.kobj,
-					&cdev->device.kobj, "thermal_cooling");
-		if (result)
-			goto unregister;
-
-		result = sysfs_create_link(&cdev->device.kobj,
-					&device->dev.kobj, "device");
-		if (result) {
-			sysfs_remove_link(&device->dev.kobj, "thermal_cooling");
-			goto unregister;
-		}
-	}
+	acpi_driver_data(device) = cdev;
+	result = sysfs_create_link(&device->dev.kobj,
+				&cdev->device.kobj, "thermal_cooling");
+	if (result)
+		printk(KERN_ERR PREFIX "Create sysfs link\n");
+	result = sysfs_create_link(&cdev->device.kobj,
+				&device->dev.kobj, "device");
+	if (result)
+		printk(KERN_ERR PREFIX "Create sysfs link\n");
 
  end:
 	return result;
-
- unregister:
-	thermal_cooling_device_unregister(cdev);
-	return result;
-
 }
 
 static int intel_menlow_memory_remove(struct acpi_device *device, int type)
@@ -213,7 +203,7 @@ static int intel_menlow_memory_remove(struct acpi_device *device, int type)
 	return 0;
 }
 
-const static struct acpi_device_id intel_menlow_memory_ids[] = {
+static const struct acpi_device_id intel_menlow_memory_ids[] = {
 	{"INT0002", 0},
 	{"", 0},
 };
diff --git a/drivers/misc/kgdbts.c b/drivers/misc/kgdbts.c
new file mode 100644
index 00000000000..6d6286c4eea
--- /dev/null
+++ b/drivers/misc/kgdbts.c
@@ -0,0 +1,1090 @@
+/*
+ * kgdbts is a test suite for kgdb for the sole purpose of validating
+ * that key pieces of the kgdb internals are working properly such as
+ * HW/SW breakpoints, single stepping, and NMI.
+ *
+ * Created by: Jason Wessel <jason.wessel@windriver.com>
+ *
+ * Copyright (c) 2008 Wind River Systems, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/* Information about the kgdb test suite.
+ * -------------------------------------
+ *
+ * The kgdb test suite is designed as a KGDB I/O module which
+ * simulates the communications that a debugger would have with kgdb.
+ * The tests are broken up in to a line by line and referenced here as
+ * a "get" which is kgdb requesting input and "put" which is kgdb
+ * sending a response.
+ *
+ * The kgdb suite can be invoked from the kernel command line
+ * arguments system or executed dynamically at run time.  The test
+ * suite uses the variable "kgdbts" to obtain the information about
+ * which tests to run and to configure the verbosity level.  The
+ * following are the various characters you can use with the kgdbts=
+ * line:
+ *
+ * When using the "kgdbts=" you only choose one of the following core
+ * test types:
+ * A = Run all the core tests silently
+ * V1 = Run all the core tests with minimal output
+ * V2 = Run all the core tests in debug mode
+ *
+ * You can also specify optional tests:
+ * N## = Go to sleep with interrupts of for ## seconds
+ *       to test the HW NMI watchdog
+ * F## = Break at do_fork for ## iterations
+ * S## = Break at sys_open for ## iterations
+ *
+ * NOTE: that the do_fork and sys_open tests are mutually exclusive.
+ *
+ * To invoke the kgdb test suite from boot you use a kernel start
+ * argument as follows:
+ * 	kgdbts=V1 kgdbwait
+ * Or if you wanted to perform the NMI test for 6 seconds and do_fork
+ * test for 100 forks, you could use:
+ * 	kgdbts=V1N6F100 kgdbwait
+ *
+ * The test suite can also be invoked at run time with:
+ *	echo kgdbts=V1N6F100 > /sys/module/kgdbts/parameters/kgdbts
+ * Or as another example:
+ *	echo kgdbts=V2 > /sys/module/kgdbts/parameters/kgdbts
+ *
+ * When developing a new kgdb arch specific implementation or
+ * using these tests for the purpose of regression testing,
+ * several invocations are required.
+ *
+ * 1) Boot with the test suite enabled by using the kernel arguments
+ *       "kgdbts=V1F100 kgdbwait"
+ *    ## If kgdb arch specific implementation has NMI use
+ *       "kgdbts=V1N6F100
+ *
+ * 2) After the system boot run the basic test.
+ * echo kgdbts=V1 > /sys/module/kgdbts/parameters/kgdbts
+ *
+ * 3) Run the concurrency tests.  It is best to use n+1
+ *    while loops where n is the number of cpus you have
+ *    in your system.  The example below uses only two
+ *    loops.
+ *
+ * ## This tests break points on sys_open
+ * while [ 1 ] ; do find / > /dev/null 2>&1 ; done &
+ * while [ 1 ] ; do find / > /dev/null 2>&1 ; done &
+ * echo kgdbts=V1S10000 > /sys/module/kgdbts/parameters/kgdbts
+ * fg # and hit control-c
+ * fg # and hit control-c
+ * ## This tests break points on do_fork
+ * while [ 1 ] ; do date > /dev/null ; done &
+ * while [ 1 ] ; do date > /dev/null ; done &
+ * echo kgdbts=V1F1000 > /sys/module/kgdbts/parameters/kgdbts
+ * fg # and hit control-c
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/kgdb.h>
+#include <linux/ctype.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/nmi.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+
+#define v1printk(a...) do { \
+	if (verbose) \
+		printk(KERN_INFO a); \
+	} while (0)
+#define v2printk(a...) do { \
+	if (verbose > 1) \
+		printk(KERN_INFO a); \
+		touch_nmi_watchdog();	\
+	} while (0)
+#define eprintk(a...) do { \
+		printk(KERN_ERR a); \
+		WARN_ON(1); \
+	} while (0)
+#define MAX_CONFIG_LEN		40
+
+static const char hexchars[] = "0123456789abcdef";
+static struct kgdb_io kgdbts_io_ops;
+static char get_buf[BUFMAX];
+static int get_buf_cnt;
+static char put_buf[BUFMAX];
+static int put_buf_cnt;
+static char scratch_buf[BUFMAX];
+static int verbose;
+static int repeat_test;
+static int test_complete;
+static int send_ack;
+static int final_ack;
+static int hw_break_val;
+static int hw_break_val2;
+#if defined(CONFIG_ARM) || defined(CONFIG_MIPS)
+static int arch_needs_sstep_emulation = 1;
+#else
+static int arch_needs_sstep_emulation;
+#endif
+static unsigned long sstep_addr;
+static int sstep_state;
+
+/* Storage for the registers, in GDB format. */
+static unsigned long kgdbts_gdb_regs[(NUMREGBYTES +
+					sizeof(unsigned long) - 1) /
+					sizeof(unsigned long)];
+static struct pt_regs kgdbts_regs;
+
+/* -1 = init not run yet, 0 = unconfigured, 1 = configured. */
+static int configured		= -1;
+
+#ifdef CONFIG_KGDB_TESTS_BOOT_STRING
+static char config[MAX_CONFIG_LEN] = CONFIG_KGDB_TESTS_BOOT_STRING;
+#else
+static char config[MAX_CONFIG_LEN];
+#endif
+static struct kparam_string kps = {
+	.string			= config,
+	.maxlen			= MAX_CONFIG_LEN,
+};
+
+static void fill_get_buf(char *buf);
+
+struct test_struct {
+	char *get;
+	char *put;
+	void (*get_handler)(char *);
+	int (*put_handler)(char *, char *);
+};
+
+struct test_state {
+	char *name;
+	struct test_struct *tst;
+	int idx;
+	int (*run_test) (int, int);
+	int (*validate_put) (char *);
+};
+
+static struct test_state ts;
+
+static int kgdbts_unreg_thread(void *ptr)
+{
+	/* Wait until the tests are complete and then ungresiter the I/O
+	 * driver.
+	 */
+	while (!final_ack)
+		msleep_interruptible(1500);
+
+	if (configured)
+		kgdb_unregister_io_module(&kgdbts_io_ops);
+	configured = 0;
+
+	return 0;
+}
+
+/* This is noinline such that it can be used for a single location to
+ * place a breakpoint
+ */
+static noinline void kgdbts_break_test(void)
+{
+	v2printk("kgdbts: breakpoint complete\n");
+}
+
+/* Lookup symbol info in the kernel */
+static unsigned long lookup_addr(char *arg)
+{
+	unsigned long addr = 0;
+
+	if (!strcmp(arg, "kgdbts_break_test"))
+		addr = (unsigned long)kgdbts_break_test;
+	else if (!strcmp(arg, "sys_open"))
+		addr = (unsigned long)sys_open;
+	else if (!strcmp(arg, "do_fork"))
+		addr = (unsigned long)do_fork;
+	else if (!strcmp(arg, "hw_break_val"))
+		addr = (unsigned long)&hw_break_val;
+	return addr;
+}
+
+static void break_helper(char *bp_type, char *arg, unsigned long vaddr)
+{
+	unsigned long addr;
+
+	if (arg)
+		addr = lookup_addr(arg);
+	else
+		addr = vaddr;
+
+	sprintf(scratch_buf, "%s,%lx,%i", bp_type, addr,
+		BREAK_INSTR_SIZE);
+	fill_get_buf(scratch_buf);
+}
+
+static void sw_break(char *arg)
+{
+	break_helper("Z0", arg, 0);
+}
+
+static void sw_rem_break(char *arg)
+{
+	break_helper("z0", arg, 0);
+}
+
+static void hw_break(char *arg)
+{
+	break_helper("Z1", arg, 0);
+}
+
+static void hw_rem_break(char *arg)
+{
+	break_helper("z1", arg, 0);
+}
+
+static void hw_write_break(char *arg)
+{
+	break_helper("Z2", arg, 0);
+}
+
+static void hw_rem_write_break(char *arg)
+{
+	break_helper("z2", arg, 0);
+}
+
+static void hw_access_break(char *arg)
+{
+	break_helper("Z4", arg, 0);
+}
+
+static void hw_rem_access_break(char *arg)
+{
+	break_helper("z4", arg, 0);
+}
+
+static void hw_break_val_access(void)
+{
+	hw_break_val2 = hw_break_val;
+}
+
+static void hw_break_val_write(void)
+{
+	hw_break_val++;
+}
+
+static int check_and_rewind_pc(char *put_str, char *arg)
+{
+	unsigned long addr = lookup_addr(arg);
+	int offset = 0;
+
+	kgdb_hex2mem(&put_str[1], (char *)kgdbts_gdb_regs,
+		 NUMREGBYTES);
+	gdb_regs_to_pt_regs(kgdbts_gdb_regs, &kgdbts_regs);
+	v2printk("Stopped at IP: %lx\n", instruction_pointer(&kgdbts_regs));
+#ifdef CONFIG_X86
+	/* On x86 a breakpoint stop requires it to be decremented */
+	if (addr + 1 == kgdbts_regs.ip)
+		offset = -1;
+#endif
+	if (strcmp(arg, "silent") &&
+		instruction_pointer(&kgdbts_regs) + offset != addr) {
+		eprintk("kgdbts: BP mismatch %lx expected %lx\n",
+			   instruction_pointer(&kgdbts_regs) + offset, addr);
+		return 1;
+	}
+#ifdef CONFIG_X86
+	/* On x86 adjust the instruction pointer if needed */
+	kgdbts_regs.ip += offset;
+#endif
+	return 0;
+}
+
+static int check_single_step(char *put_str, char *arg)
+{
+	unsigned long addr = lookup_addr(arg);
+	/*
+	 * From an arch indepent point of view the instruction pointer
+	 * should be on a different instruction
+	 */
+	kgdb_hex2mem(&put_str[1], (char *)kgdbts_gdb_regs,
+		 NUMREGBYTES);
+	gdb_regs_to_pt_regs(kgdbts_gdb_regs, &kgdbts_regs);
+	v2printk("Singlestep stopped at IP: %lx\n",
+		   instruction_pointer(&kgdbts_regs));
+	if (instruction_pointer(&kgdbts_regs) == addr) {
+		eprintk("kgdbts: SingleStep failed at %lx\n",
+			   instruction_pointer(&kgdbts_regs));
+		return 1;
+	}
+
+	return 0;
+}
+
+static void write_regs(char *arg)
+{
+	memset(scratch_buf, 0, sizeof(scratch_buf));
+	scratch_buf[0] = 'G';
+	pt_regs_to_gdb_regs(kgdbts_gdb_regs, &kgdbts_regs);
+	kgdb_mem2hex((char *)kgdbts_gdb_regs, &scratch_buf[1], NUMREGBYTES);
+	fill_get_buf(scratch_buf);
+}
+
+static void skip_back_repeat_test(char *arg)
+{
+	int go_back = simple_strtol(arg, NULL, 10);
+
+	repeat_test--;
+	if (repeat_test <= 0)
+		ts.idx++;
+	else
+		ts.idx -= go_back;
+	fill_get_buf(ts.tst[ts.idx].get);
+}
+
+static int got_break(char *put_str, char *arg)
+{
+	test_complete = 1;
+	if (!strncmp(put_str+1, arg, 2)) {
+		if (!strncmp(arg, "T0", 2))
+			test_complete = 2;
+		return 0;
+	}
+	return 1;
+}
+
+static void emul_sstep_get(char *arg)
+{
+	if (!arch_needs_sstep_emulation) {
+		fill_get_buf(arg);
+		return;
+	}
+	switch (sstep_state) {
+	case 0:
+		v2printk("Emulate single step\n");
+		/* Start by looking at the current PC */
+		fill_get_buf("g");
+		break;
+	case 1:
+		/* set breakpoint */
+		break_helper("Z0", 0, sstep_addr);
+		break;
+	case 2:
+		/* Continue */
+		fill_get_buf("c");
+		break;
+	case 3:
+		/* Clear breakpoint */
+		break_helper("z0", 0, sstep_addr);
+		break;
+	default:
+		eprintk("kgdbts: ERROR failed sstep get emulation\n");
+	}
+	sstep_state++;
+}
+
+static int emul_sstep_put(char *put_str, char *arg)
+{
+	if (!arch_needs_sstep_emulation) {
+		if (!strncmp(put_str+1, arg, 2))
+			return 0;
+		return 1;
+	}
+	switch (sstep_state) {
+	case 1:
+		/* validate the "g" packet to get the IP */
+		kgdb_hex2mem(&put_str[1], (char *)kgdbts_gdb_regs,
+			 NUMREGBYTES);
+		gdb_regs_to_pt_regs(kgdbts_gdb_regs, &kgdbts_regs);
+		v2printk("Stopped at IP: %lx\n",
+			 instruction_pointer(&kgdbts_regs));
+		/* Want to stop at IP + break instruction size by default */
+		sstep_addr = instruction_pointer(&kgdbts_regs) +
+			BREAK_INSTR_SIZE;
+		break;
+	case 2:
+		if (strncmp(put_str, "$OK", 3)) {
+			eprintk("kgdbts: failed sstep break set\n");
+			return 1;
+		}
+		break;
+	case 3:
+		if (strncmp(put_str, "$T0", 3)) {
+			eprintk("kgdbts: failed continue sstep\n");
+			return 1;
+		}
+		break;
+	case 4:
+		if (strncmp(put_str, "$OK", 3)) {
+			eprintk("kgdbts: failed sstep break unset\n");
+			return 1;
+		}
+		/* Single step is complete so continue on! */
+		sstep_state = 0;
+		return 0;
+	default:
+		eprintk("kgdbts: ERROR failed sstep put emulation\n");
+	}
+
+	/* Continue on the same test line until emulation is complete */
+	ts.idx--;
+	return 0;
+}
+
+static int final_ack_set(char *put_str, char *arg)
+{
+	if (strncmp(put_str+1, arg, 2))
+		return 1;
+	final_ack = 1;
+	return 0;
+}
+/*
+ * Test to plant a breakpoint and detach, which should clear out the
+ * breakpoint and restore the original instruction.
+ */
+static struct test_struct plant_and_detach_test[] = {
+	{ "?", "S0*" }, /* Clear break points */
+	{ "kgdbts_break_test", "OK", sw_break, }, /* set sw breakpoint */
+	{ "D", "OK" }, /* Detach */
+	{ "", "" },
+};
+
+/*
+ * Simple test to write in a software breakpoint, check for the
+ * correct stop location and detach.
+ */
+static struct test_struct sw_breakpoint_test[] = {
+	{ "?", "S0*" }, /* Clear break points */
+	{ "kgdbts_break_test", "OK", sw_break, }, /* set sw breakpoint */
+	{ "c", "T0*", }, /* Continue */
+	{ "g", "kgdbts_break_test", 0, check_and_rewind_pc },
+	{ "write", "OK", write_regs },
+	{ "kgdbts_break_test", "OK", sw_rem_break }, /*remove breakpoint */
+	{ "D", "OK" }, /* Detach */
+	{ "D", "OK", 0,  got_break }, /* If the test worked we made it here */
+	{ "", "" },
+};
+
+/*
+ * Test a known bad memory read location to test the fault handler and
+ * read bytes 1-8 at the bad address
+ */
+static struct test_struct bad_read_test[] = {
+	{ "?", "S0*" }, /* Clear break points */
+	{ "m0,1", "E*" }, /* read 1 byte at address 1 */
+	{ "m0,2", "E*" }, /* read 1 byte at address 2 */
+	{ "m0,3", "E*" }, /* read 1 byte at address 3 */
+	{ "m0,4", "E*" }, /* read 1 byte at address 4 */
+	{ "m0,5", "E*" }, /* read 1 byte at address 5 */
+	{ "m0,6", "E*" }, /* read 1 byte at address 6 */
+	{ "m0,7", "E*" }, /* read 1 byte at address 7 */
+	{ "m0,8", "E*" }, /* read 1 byte at address 8 */
+	{ "D", "OK" }, /* Detach which removes all breakpoints and continues */
+	{ "", "" },
+};
+
+/*
+ * Test for hitting a breakpoint, remove it, single step, plant it
+ * again and detach.
+ */
+static struct test_struct singlestep_break_test[] = {
+	{ "?", "S0*" }, /* Clear break points */
+	{ "kgdbts_break_test", "OK", sw_break, }, /* set sw breakpoint */
+	{ "c", "T0*", }, /* Continue */
+	{ "g", "kgdbts_break_test", 0, check_and_rewind_pc },
+	{ "write", "OK", write_regs }, /* Write registers */
+	{ "kgdbts_break_test", "OK", sw_rem_break }, /*remove breakpoint */
+	{ "s", "T0*", emul_sstep_get, emul_sstep_put }, /* Single step */
+	{ "g", "kgdbts_break_test", 0, check_single_step },
+	{ "kgdbts_break_test", "OK", sw_break, }, /* set sw breakpoint */
+	{ "c", "T0*", }, /* Continue */
+	{ "g", "kgdbts_break_test", 0, check_and_rewind_pc },
+	{ "write", "OK", write_regs }, /* Write registers */
+	{ "D", "OK" }, /* Remove all breakpoints and continues */
+	{ "", "" },
+};
+
+/*
+ * Test for hitting a breakpoint at do_fork for what ever the number
+ * of iterations required by the variable repeat_test.
+ */
+static struct test_struct do_fork_test[] = {
+	{ "?", "S0*" }, /* Clear break points */
+	{ "do_fork", "OK", sw_break, }, /* set sw breakpoint */
+	{ "c", "T0*", }, /* Continue */
+	{ "g", "do_fork", 0, check_and_rewind_pc }, /* check location */
+	{ "write", "OK", write_regs }, /* Write registers */
+	{ "do_fork", "OK", sw_rem_break }, /*remove breakpoint */
+	{ "s", "T0*", emul_sstep_get, emul_sstep_put }, /* Single step */
+	{ "g", "do_fork", 0, check_single_step },
+	{ "do_fork", "OK", sw_break, }, /* set sw breakpoint */
+	{ "7", "T0*", skip_back_repeat_test }, /* Loop based on repeat_test */
+	{ "D", "OK", 0, final_ack_set }, /* detach and unregister I/O */
+	{ "", "" },
+};
+
+/* Test for hitting a breakpoint at sys_open for what ever the number
+ * of iterations required by the variable repeat_test.
+ */
+static struct test_struct sys_open_test[] = {
+	{ "?", "S0*" }, /* Clear break points */
+	{ "sys_open", "OK", sw_break, }, /* set sw breakpoint */
+	{ "c", "T0*", }, /* Continue */
+	{ "g", "sys_open", 0, check_and_rewind_pc }, /* check location */
+	{ "write", "OK", write_regs }, /* Write registers */
+	{ "sys_open", "OK", sw_rem_break }, /*remove breakpoint */
+	{ "s", "T0*", emul_sstep_get, emul_sstep_put }, /* Single step */
+	{ "g", "sys_open", 0, check_single_step },
+	{ "sys_open", "OK", sw_break, }, /* set sw breakpoint */
+	{ "7", "T0*", skip_back_repeat_test }, /* Loop based on repeat_test */
+	{ "D", "OK", 0, final_ack_set }, /* detach and unregister I/O */
+	{ "", "" },
+};
+
+/*
+ * Test for hitting a simple hw breakpoint
+ */
+static struct test_struct hw_breakpoint_test[] = {
+	{ "?", "S0*" }, /* Clear break points */
+	{ "kgdbts_break_test", "OK", hw_break, }, /* set hw breakpoint */
+	{ "c", "T0*", }, /* Continue */
+	{ "g", "kgdbts_break_test", 0, check_and_rewind_pc },
+	{ "write", "OK", write_regs },
+	{ "kgdbts_break_test", "OK", hw_rem_break }, /*remove breakpoint */
+	{ "D", "OK" }, /* Detach */
+	{ "D", "OK", 0,  got_break }, /* If the test worked we made it here */
+	{ "", "" },
+};
+
+/*
+ * Test for hitting a hw write breakpoint
+ */
+static struct test_struct hw_write_break_test[] = {
+	{ "?", "S0*" }, /* Clear break points */
+	{ "hw_break_val", "OK", hw_write_break, }, /* set hw breakpoint */
+	{ "c", "T0*", 0, got_break }, /* Continue */
+	{ "g", "silent", 0, check_and_rewind_pc },
+	{ "write", "OK", write_regs },
+	{ "hw_break_val", "OK", hw_rem_write_break }, /*remove breakpoint */
+	{ "D", "OK" }, /* Detach */
+	{ "D", "OK", 0,  got_break }, /* If the test worked we made it here */
+	{ "", "" },
+};
+
+/*
+ * Test for hitting a hw access breakpoint
+ */
+static struct test_struct hw_access_break_test[] = {
+	{ "?", "S0*" }, /* Clear break points */
+	{ "hw_break_val", "OK", hw_access_break, }, /* set hw breakpoint */
+	{ "c", "T0*", 0, got_break }, /* Continue */
+	{ "g", "silent", 0, check_and_rewind_pc },
+	{ "write", "OK", write_regs },
+	{ "hw_break_val", "OK", hw_rem_access_break }, /*remove breakpoint */
+	{ "D", "OK" }, /* Detach */
+	{ "D", "OK", 0,  got_break }, /* If the test worked we made it here */
+	{ "", "" },
+};
+
+/*
+ * Test for hitting a hw access breakpoint
+ */
+static struct test_struct nmi_sleep_test[] = {
+	{ "?", "S0*" }, /* Clear break points */
+	{ "c", "T0*", 0, got_break }, /* Continue */
+	{ "D", "OK" }, /* Detach */
+	{ "D", "OK", 0,  got_break }, /* If the test worked we made it here */
+	{ "", "" },
+};
+
+static void fill_get_buf(char *buf)
+{
+	unsigned char checksum = 0;
+	int count = 0;
+	char ch;
+
+	strcpy(get_buf, "$");
+	strcat(get_buf, buf);
+	while ((ch = buf[count])) {
+		checksum += ch;
+		count++;
+	}
+	strcat(get_buf, "#");
+	get_buf[count + 2] = hexchars[checksum >> 4];
+	get_buf[count + 3] = hexchars[checksum & 0xf];
+	get_buf[count + 4] = '\0';
+	v2printk("get%i: %s\n", ts.idx, get_buf);
+}
+
+static int validate_simple_test(char *put_str)
+{
+	char *chk_str;
+
+	if (ts.tst[ts.idx].put_handler)
+		return ts.tst[ts.idx].put_handler(put_str,
+			ts.tst[ts.idx].put);
+
+	chk_str = ts.tst[ts.idx].put;
+	if (*put_str == '$')
+		put_str++;
+
+	while (*chk_str != '\0' && *put_str != '\0') {
+		/* If someone does a * to match the rest of the string, allow
+		 * it, or stop if the recieved string is complete.
+		 */
+		if (*put_str == '#' || *chk_str == '*')
+			return 0;
+		if (*put_str != *chk_str)
+			return 1;
+
+		chk_str++;
+		put_str++;
+	}
+	if (*chk_str == '\0' && (*put_str == '\0' || *put_str == '#'))
+		return 0;
+
+	return 1;
+}
+
+static int run_simple_test(int is_get_char, int chr)
+{
+	int ret = 0;
+	if (is_get_char) {
+		/* Send an ACK on the get if a prior put completed and set the
+		 * send ack variable
+		 */
+		if (send_ack) {
+			send_ack = 0;
+			return '+';
+		}
+		/* On the first get char, fill the transmit buffer and then
+		 * take from the get_string.
+		 */
+		if (get_buf_cnt == 0) {
+			if (ts.tst[ts.idx].get_handler)
+				ts.tst[ts.idx].get_handler(ts.tst[ts.idx].get);
+			else
+				fill_get_buf(ts.tst[ts.idx].get);
+		}
+
+		if (get_buf[get_buf_cnt] == '\0') {
+			eprintk("kgdbts: ERROR GET: EOB on '%s' at %i\n",
+			   ts.name, ts.idx);
+			get_buf_cnt = 0;
+			fill_get_buf("D");
+		}
+		ret = get_buf[get_buf_cnt];
+		get_buf_cnt++;
+		return ret;
+	}
+
+	/* This callback is a put char which is when kgdb sends data to
+	 * this I/O module.
+	 */
+	if (ts.tst[ts.idx].get[0] == '\0' &&
+		ts.tst[ts.idx].put[0] == '\0') {
+		eprintk("kgdbts: ERROR: beyond end of test on"
+			   " '%s' line %i\n", ts.name, ts.idx);
+		return 0;
+	}
+
+	if (put_buf_cnt >= BUFMAX) {
+		eprintk("kgdbts: ERROR: put buffer overflow on"
+			   " '%s' line %i\n", ts.name, ts.idx);
+		put_buf_cnt = 0;
+		return 0;
+	}
+	/* Ignore everything until the first valid packet start '$' */
+	if (put_buf_cnt == 0 && chr != '$')
+		return 0;
+
+	put_buf[put_buf_cnt] = chr;
+	put_buf_cnt++;
+
+	/* End of packet == #XX so look for the '#' */
+	if (put_buf_cnt > 3 && put_buf[put_buf_cnt - 3] == '#') {
+		put_buf[put_buf_cnt] = '\0';
+		v2printk("put%i: %s\n", ts.idx, put_buf);
+		/* Trigger check here */
+		if (ts.validate_put && ts.validate_put(put_buf)) {
+			eprintk("kgdbts: ERROR PUT: end of test "
+			   "buffer on '%s' line %i expected %s got %s\n",
+			   ts.name, ts.idx, ts.tst[ts.idx].put, put_buf);
+		}
+		ts.idx++;
+		put_buf_cnt = 0;
+		get_buf_cnt = 0;
+		send_ack = 1;
+	}
+	return 0;
+}
+
+static void init_simple_test(void)
+{
+	memset(&ts, 0, sizeof(ts));
+	ts.run_test = run_simple_test;
+	ts.validate_put = validate_simple_test;
+}
+
+static void run_plant_and_detach_test(int is_early)
+{
+	char before[BREAK_INSTR_SIZE];
+	char after[BREAK_INSTR_SIZE];
+
+	probe_kernel_read(before, (char *)kgdbts_break_test,
+	  BREAK_INSTR_SIZE);
+	init_simple_test();
+	ts.tst = plant_and_detach_test;
+	ts.name = "plant_and_detach_test";
+	/* Activate test with initial breakpoint */
+	if (!is_early)
+		kgdb_breakpoint();
+	probe_kernel_read(after, (char *)kgdbts_break_test,
+	  BREAK_INSTR_SIZE);
+	if (memcmp(before, after, BREAK_INSTR_SIZE)) {
+		printk(KERN_CRIT "kgdbts: ERROR kgdb corrupted memory\n");
+		panic("kgdb memory corruption");
+	}
+
+	/* complete the detach test */
+	if (!is_early)
+		kgdbts_break_test();
+}
+
+static void run_breakpoint_test(int is_hw_breakpoint)
+{
+	test_complete = 0;
+	init_simple_test();
+	if (is_hw_breakpoint) {
+		ts.tst = hw_breakpoint_test;
+		ts.name = "hw_breakpoint_test";
+	} else {
+		ts.tst = sw_breakpoint_test;
+		ts.name = "sw_breakpoint_test";
+	}
+	/* Activate test with initial breakpoint */
+	kgdb_breakpoint();
+	/* run code with the break point in it */
+	kgdbts_break_test();
+	kgdb_breakpoint();
+
+	if (test_complete)
+		return;
+
+	eprintk("kgdbts: ERROR %s test failed\n", ts.name);
+}
+
+static void run_hw_break_test(int is_write_test)
+{
+	test_complete = 0;
+	init_simple_test();
+	if (is_write_test) {
+		ts.tst = hw_write_break_test;
+		ts.name = "hw_write_break_test";
+	} else {
+		ts.tst = hw_access_break_test;
+		ts.name = "hw_access_break_test";
+	}
+	/* Activate test with initial breakpoint */
+	kgdb_breakpoint();
+	hw_break_val_access();
+	if (is_write_test) {
+		if (test_complete == 2)
+			eprintk("kgdbts: ERROR %s broke on access\n",
+				ts.name);
+		hw_break_val_write();
+	}
+	kgdb_breakpoint();
+
+	if (test_complete == 1)
+		return;
+
+	eprintk("kgdbts: ERROR %s test failed\n", ts.name);
+}
+
+static void run_nmi_sleep_test(int nmi_sleep)
+{
+	unsigned long flags;
+
+	init_simple_test();
+	ts.tst = nmi_sleep_test;
+	ts.name = "nmi_sleep_test";
+	/* Activate test with initial breakpoint */
+	kgdb_breakpoint();
+	local_irq_save(flags);
+	mdelay(nmi_sleep*1000);
+	touch_nmi_watchdog();
+	local_irq_restore(flags);
+	if (test_complete != 2)
+		eprintk("kgdbts: ERROR nmi_test did not hit nmi\n");
+	kgdb_breakpoint();
+	if (test_complete == 1)
+		return;
+
+	eprintk("kgdbts: ERROR %s test failed\n", ts.name);
+}
+
+static void run_bad_read_test(void)
+{
+	init_simple_test();
+	ts.tst = bad_read_test;
+	ts.name = "bad_read_test";
+	/* Activate test with initial breakpoint */
+	kgdb_breakpoint();
+}
+
+static void run_do_fork_test(void)
+{
+	init_simple_test();
+	ts.tst = do_fork_test;
+	ts.name = "do_fork_test";
+	/* Activate test with initial breakpoint */
+	kgdb_breakpoint();
+}
+
+static void run_sys_open_test(void)
+{
+	init_simple_test();
+	ts.tst = sys_open_test;
+	ts.name = "sys_open_test";
+	/* Activate test with initial breakpoint */
+	kgdb_breakpoint();
+}
+
+static void run_singlestep_break_test(void)
+{
+	init_simple_test();
+	ts.tst = singlestep_break_test;
+	ts.name = "singlestep_breakpoint_test";
+	/* Activate test with initial breakpoint */
+	kgdb_breakpoint();
+	kgdbts_break_test();
+	kgdbts_break_test();
+}
+
+static void kgdbts_run_tests(void)
+{
+	char *ptr;
+	int fork_test = 0;
+	int sys_open_test = 0;
+	int nmi_sleep = 0;
+
+	ptr = strstr(config, "F");
+	if (ptr)
+		fork_test = simple_strtol(ptr+1, NULL, 10);
+	ptr = strstr(config, "S");
+	if (ptr)
+		sys_open_test = simple_strtol(ptr+1, NULL, 10);
+	ptr = strstr(config, "N");
+	if (ptr)
+		nmi_sleep = simple_strtol(ptr+1, NULL, 10);
+
+	/* required internal KGDB tests */
+	v1printk("kgdbts:RUN plant and detach test\n");
+	run_plant_and_detach_test(0);
+	v1printk("kgdbts:RUN sw breakpoint test\n");
+	run_breakpoint_test(0);
+	v1printk("kgdbts:RUN bad memory access test\n");
+	run_bad_read_test();
+	v1printk("kgdbts:RUN singlestep breakpoint test\n");
+	run_singlestep_break_test();
+
+	/* ===Optional tests=== */
+
+	/* All HW break point tests */
+	if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT) {
+		v1printk("kgdbts:RUN hw breakpoint test\n");
+		run_breakpoint_test(1);
+		v1printk("kgdbts:RUN hw write breakpoint test\n");
+		run_hw_break_test(1);
+		v1printk("kgdbts:RUN access write breakpoint test\n");
+		run_hw_break_test(0);
+	}
+
+	if (nmi_sleep) {
+		v1printk("kgdbts:RUN NMI sleep %i seconds test\n", nmi_sleep);
+		run_nmi_sleep_test(nmi_sleep);
+	}
+
+	/* If the do_fork test is run it will be the last test that is
+	 * executed because a kernel thread will be spawned at the very
+	 * end to unregister the debug hooks.
+	 */
+	if (fork_test) {
+		repeat_test = fork_test;
+		printk(KERN_INFO "kgdbts:RUN do_fork for %i breakpoints\n",
+			repeat_test);
+		kthread_run(kgdbts_unreg_thread, 0, "kgdbts_unreg");
+		run_do_fork_test();
+		return;
+	}
+
+	/* If the sys_open test is run it will be the last test that is
+	 * executed because a kernel thread will be spawned at the very
+	 * end to unregister the debug hooks.
+	 */
+	if (sys_open_test) {
+		repeat_test = sys_open_test;
+		printk(KERN_INFO "kgdbts:RUN sys_open for %i breakpoints\n",
+			repeat_test);
+		kthread_run(kgdbts_unreg_thread, 0, "kgdbts_unreg");
+		run_sys_open_test();
+		return;
+	}
+	/* Shutdown and unregister */
+	kgdb_unregister_io_module(&kgdbts_io_ops);
+	configured = 0;
+}
+
+static int kgdbts_option_setup(char *opt)
+{
+	if (strlen(opt) > MAX_CONFIG_LEN) {
+		printk(KERN_ERR "kgdbts: config string too long\n");
+		return -ENOSPC;
+	}
+	strcpy(config, opt);
+
+	verbose = 0;
+	if (strstr(config, "V1"))
+		verbose = 1;
+	if (strstr(config, "V2"))
+		verbose = 2;
+
+	return 0;
+}
+
+__setup("kgdbts=", kgdbts_option_setup);
+
+static int configure_kgdbts(void)
+{
+	int err = 0;
+
+	if (!strlen(config) || isspace(config[0]))
+		goto noconfig;
+	err = kgdbts_option_setup(config);
+	if (err)
+		goto noconfig;
+
+	final_ack = 0;
+	run_plant_and_detach_test(1);
+
+	err = kgdb_register_io_module(&kgdbts_io_ops);
+	if (err) {
+		configured = 0;
+		return err;
+	}
+	configured = 1;
+	kgdbts_run_tests();
+
+	return err;
+
+noconfig:
+	config[0] = 0;
+	configured = 0;
+
+	return err;
+}
+
+static int __init init_kgdbts(void)
+{
+	/* Already configured? */
+	if (configured == 1)
+		return 0;
+
+	return configure_kgdbts();
+}
+
+static void cleanup_kgdbts(void)
+{
+	if (configured == 1)
+		kgdb_unregister_io_module(&kgdbts_io_ops);
+}
+
+static int kgdbts_get_char(void)
+{
+	int val = 0;
+
+	if (ts.run_test)
+		val = ts.run_test(1, 0);
+
+	return val;
+}
+
+static void kgdbts_put_char(u8 chr)
+{
+	if (ts.run_test)
+		ts.run_test(0, chr);
+}
+
+static int param_set_kgdbts_var(const char *kmessage, struct kernel_param *kp)
+{
+	int len = strlen(kmessage);
+
+	if (len >= MAX_CONFIG_LEN) {
+		printk(KERN_ERR "kgdbts: config string too long\n");
+		return -ENOSPC;
+	}
+
+	/* Only copy in the string if the init function has not run yet */
+	if (configured < 0) {
+		strcpy(config, kmessage);
+		return 0;
+	}
+
+	if (kgdb_connected) {
+		printk(KERN_ERR
+	       "kgdbts: Cannot reconfigure while KGDB is connected.\n");
+
+		return -EBUSY;
+	}
+
+	strcpy(config, kmessage);
+	/* Chop out \n char as a result of echo */
+	if (config[len - 1] == '\n')
+		config[len - 1] = '\0';
+
+	if (configured == 1)
+		cleanup_kgdbts();
+
+	/* Go and configure with the new params. */
+	return configure_kgdbts();
+}
+
+static void kgdbts_pre_exp_handler(void)
+{
+	/* Increment the module count when the debugger is active */
+	if (!kgdb_connected)
+		try_module_get(THIS_MODULE);
+}
+
+static void kgdbts_post_exp_handler(void)
+{
+	/* decrement the module count when the debugger detaches */
+	if (!kgdb_connected)
+		module_put(THIS_MODULE);
+}
+
+static struct kgdb_io kgdbts_io_ops = {
+	.name			= "kgdbts",
+	.read_char		= kgdbts_get_char,
+	.write_char		= kgdbts_put_char,
+	.pre_exception		= kgdbts_pre_exp_handler,
+	.post_exception		= kgdbts_post_exp_handler,
+};
+
+module_init(init_kgdbts);
+module_exit(cleanup_kgdbts);
+module_param_call(kgdbts, param_set_kgdbts_var, param_get_string, &kps, 0644);
+MODULE_PARM_DESC(kgdbts, "<A|V1|V2>[F#|S#][N#]");
+MODULE_DESCRIPTION("KGDB Test Suite");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Wind River Systems, Inc.");
+
diff --git a/drivers/misc/sgi-xp/Makefile b/drivers/misc/sgi-xp/Makefile
new file mode 100644
index 00000000000..b6e40a7958c
--- /dev/null
+++ b/drivers/misc/sgi-xp/Makefile
@@ -0,0 +1,11 @@
+#
+# Makefile for SGI's XP devices.
+#
+
+obj-$(CONFIG_SGI_XP)		+= xp.o
+xp-y				:= xp_main.o xp_nofault.o
+
+obj-$(CONFIG_SGI_XP)		+= xpc.o
+xpc-y				:= xpc_main.o xpc_channel.o xpc_partition.o
+
+obj-$(CONFIG_SGI_XP)		+= xpnet.o
diff --git a/drivers/misc/sgi-xp/xp.h b/drivers/misc/sgi-xp/xp.h
new file mode 100644
index 00000000000..5515234be86
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp.h
@@ -0,0 +1,463 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2004-2008 Silicon Graphics, Inc. All rights reserved.
+ */
+
+/*
+ * External Cross Partition (XP) structures and defines.
+ */
+
+#ifndef _DRIVERS_MISC_SGIXP_XP_H
+#define _DRIVERS_MISC_SGIXP_XP_H
+
+#include <linux/cache.h>
+#include <linux/hardirq.h>
+#include <linux/mutex.h>
+#include <asm/sn/types.h>
+#include <asm/sn/bte.h>
+
+#ifdef USE_DBUG_ON
+#define DBUG_ON(condition)	BUG_ON(condition)
+#else
+#define DBUG_ON(condition)
+#endif
+
+/*
+ * Define the maximum number of logically defined partitions the system
+ * can support. It is constrained by the maximum number of hardware
+ * partitionable regions. The term 'region' in this context refers to the
+ * minimum number of nodes that can comprise an access protection grouping.
+ * The access protection is in regards to memory, IPI and IOI.
+ *
+ * The maximum number of hardware partitionable regions is equal to the
+ * maximum number of nodes in the entire system divided by the minimum number
+ * of nodes that comprise an access protection grouping.
+ */
+#define XP_MAX_PARTITIONS	64
+
+/*
+ * Define the number of u64s required to represent all the C-brick nasids
+ * as a bitmap.  The cross-partition kernel modules deal only with
+ * C-brick nasids, thus the need for bitmaps which don't account for
+ * odd-numbered (non C-brick) nasids.
+ */
+#define XP_MAX_PHYSNODE_ID	(MAX_NUMALINK_NODES / 2)
+#define XP_NASID_MASK_BYTES	((XP_MAX_PHYSNODE_ID + 7) / 8)
+#define XP_NASID_MASK_WORDS	((XP_MAX_PHYSNODE_ID + 63) / 64)
+
+/*
+ * Wrapper for bte_copy() that should it return a failure status will retry
+ * the bte_copy() once in the hope that the failure was due to a temporary
+ * aberration (i.e., the link going down temporarily).
+ *
+ * 	src - physical address of the source of the transfer.
+ *	vdst - virtual address of the destination of the transfer.
+ *	len - number of bytes to transfer from source to destination.
+ *	mode - see bte_copy() for definition.
+ *	notification - see bte_copy() for definition.
+ *
+ * Note: xp_bte_copy() should never be called while holding a spinlock.
+ */
+static inline bte_result_t
+xp_bte_copy(u64 src, u64 vdst, u64 len, u64 mode, void *notification)
+{
+	bte_result_t ret;
+	u64 pdst = ia64_tpa(vdst);
+
+	/*
+	 * Ensure that the physically mapped memory is contiguous.
+	 *
+	 * We do this by ensuring that the memory is from region 7 only.
+	 * If the need should arise to use memory from one of the other
+	 * regions, then modify the BUG_ON() statement to ensure that the
+	 * memory from that region is always physically contiguous.
+	 */
+	BUG_ON(REGION_NUMBER(vdst) != RGN_KERNEL);
+
+	ret = bte_copy(src, pdst, len, mode, notification);
+	if ((ret != BTE_SUCCESS) && BTE_ERROR_RETRY(ret)) {
+		if (!in_interrupt())
+			cond_resched();
+
+		ret = bte_copy(src, pdst, len, mode, notification);
+	}
+
+	return ret;
+}
+
+/*
+ * XPC establishes channel connections between the local partition and any
+ * other partition that is currently up. Over these channels, kernel-level
+ * `users' can communicate with their counterparts on the other partitions.
+ *
+ * The maxinum number of channels is limited to eight. For performance reasons,
+ * the internal cross partition structures require sixteen bytes per channel,
+ * and eight allows all of this interface-shared info to fit in one cache line.
+ *
+ * XPC_NCHANNELS reflects the total number of channels currently defined.
+ * If the need for additional channels arises, one can simply increase
+ * XPC_NCHANNELS accordingly. If the day should come where that number
+ * exceeds the MAXIMUM number of channels allowed (eight), then one will need
+ * to make changes to the XPC code to allow for this.
+ */
+#define XPC_MEM_CHANNEL		0	/* memory channel number */
+#define	XPC_NET_CHANNEL		1	/* network channel number */
+
+#define	XPC_NCHANNELS		2	/* #of defined channels */
+#define XPC_MAX_NCHANNELS	8	/* max #of channels allowed */
+
+#if XPC_NCHANNELS > XPC_MAX_NCHANNELS
+#error	XPC_NCHANNELS exceeds MAXIMUM allowed.
+#endif
+
+/*
+ * The format of an XPC message is as follows:
+ *
+ *      +-------+--------------------------------+
+ *      | flags |////////////////////////////////|
+ *      +-------+--------------------------------+
+ *      |             message #                  |
+ *      +----------------------------------------+
+ *      |     payload (user-defined message)     |
+ *      |                                        |
+ *         		:
+ *      |                                        |
+ *      +----------------------------------------+
+ *
+ * The size of the payload is defined by the user via xpc_connect(). A user-
+ * defined message resides in the payload area.
+ *
+ * The user should have no dealings with the message header, but only the
+ * message's payload. When a message entry is allocated (via xpc_allocate())
+ * a pointer to the payload area is returned and not the actual beginning of
+ * the XPC message. The user then constructs a message in the payload area
+ * and passes that pointer as an argument on xpc_send() or xpc_send_notify().
+ *
+ * The size of a message entry (within a message queue) must be a cacheline
+ * sized multiple in order to facilitate the BTE transfer of messages from one
+ * message queue to another. A macro, XPC_MSG_SIZE(), is provided for the user
+ * that wants to fit as many msg entries as possible in a given memory size
+ * (e.g. a memory page).
+ */
+struct xpc_msg {
+	u8 flags;		/* FOR XPC INTERNAL USE ONLY */
+	u8 reserved[7];		/* FOR XPC INTERNAL USE ONLY */
+	s64 number;		/* FOR XPC INTERNAL USE ONLY */
+
+	u64 payload;		/* user defined portion of message */
+};
+
+#define XPC_MSG_PAYLOAD_OFFSET	(u64) (&((struct xpc_msg *)0)->payload)
+#define XPC_MSG_SIZE(_payload_size) \
+		L1_CACHE_ALIGN(XPC_MSG_PAYLOAD_OFFSET + (_payload_size))
+
+/*
+ * Define the return values and values passed to user's callout functions.
+ * (It is important to add new value codes at the end just preceding
+ * xpcUnknownReason, which must have the highest numerical value.)
+ */
+enum xpc_retval {
+	xpcSuccess = 0,
+
+	xpcNotConnected,	/*  1: channel is not connected */
+	xpcConnected,		/*  2: channel connected (opened) */
+	xpcRETIRED1,		/*  3: (formerly xpcDisconnected) */
+
+	xpcMsgReceived,		/*  4: message received */
+	xpcMsgDelivered,	/*  5: message delivered and acknowledged */
+
+	xpcRETIRED2,		/*  6: (formerly xpcTransferFailed) */
+
+	xpcNoWait,		/*  7: operation would require wait */
+	xpcRetry,		/*  8: retry operation */
+	xpcTimeout,		/*  9: timeout in xpc_allocate_msg_wait() */
+	xpcInterrupted,		/* 10: interrupted wait */
+
+	xpcUnequalMsgSizes,	/* 11: message size disparity between sides */
+	xpcInvalidAddress,	/* 12: invalid address */
+
+	xpcNoMemory,		/* 13: no memory available for XPC structures */
+	xpcLackOfResources,	/* 14: insufficient resources for operation */
+	xpcUnregistered,	/* 15: channel is not registered */
+	xpcAlreadyRegistered,	/* 16: channel is already registered */
+
+	xpcPartitionDown,	/* 17: remote partition is down */
+	xpcNotLoaded,		/* 18: XPC module is not loaded */
+	xpcUnloading,		/* 19: this side is unloading XPC module */
+
+	xpcBadMagic,		/* 20: XPC MAGIC string not found */
+
+	xpcReactivating,	/* 21: remote partition was reactivated */
+
+	xpcUnregistering,	/* 22: this side is unregistering channel */
+	xpcOtherUnregistering,	/* 23: other side is unregistering channel */
+
+	xpcCloneKThread,	/* 24: cloning kernel thread */
+	xpcCloneKThreadFailed,	/* 25: cloning kernel thread failed */
+
+	xpcNoHeartbeat,		/* 26: remote partition has no heartbeat */
+
+	xpcPioReadError,	/* 27: PIO read error */
+	xpcPhysAddrRegFailed,	/* 28: registration of phys addr range failed */
+
+	xpcBteDirectoryError,	/* 29: maps to BTEFAIL_DIR */
+	xpcBtePoisonError,	/* 30: maps to BTEFAIL_POISON */
+	xpcBteWriteError,	/* 31: maps to BTEFAIL_WERR */
+	xpcBteAccessError,	/* 32: maps to BTEFAIL_ACCESS */
+	xpcBtePWriteError,	/* 33: maps to BTEFAIL_PWERR */
+	xpcBtePReadError,	/* 34: maps to BTEFAIL_PRERR */
+	xpcBteTimeOutError,	/* 35: maps to BTEFAIL_TOUT */
+	xpcBteXtalkError,	/* 36: maps to BTEFAIL_XTERR */
+	xpcBteNotAvailable,	/* 37: maps to BTEFAIL_NOTAVAIL */
+	xpcBteUnmappedError,	/* 38: unmapped BTEFAIL_ error */
+
+	xpcBadVersion,		/* 39: bad version number */
+	xpcVarsNotSet,		/* 40: the XPC variables are not set up */
+	xpcNoRsvdPageAddr,	/* 41: unable to get rsvd page's phys addr */
+	xpcInvalidPartid,	/* 42: invalid partition ID */
+	xpcLocalPartid,		/* 43: local partition ID */
+
+	xpcOtherGoingDown,	/* 44: other side going down, reason unknown */
+	xpcSystemGoingDown,	/* 45: system is going down, reason unknown */
+	xpcSystemHalt,		/* 46: system is being halted */
+	xpcSystemReboot,	/* 47: system is being rebooted */
+	xpcSystemPoweroff,	/* 48: system is being powered off */
+
+	xpcDisconnecting,	/* 49: channel disconnecting (closing) */
+
+	xpcOpenCloseError,	/* 50: channel open/close protocol error */
+
+	xpcDisconnected,	/* 51: channel disconnected (closed) */
+
+	xpcBteSh2Start,		/* 52: BTE CRB timeout */
+
+				/* 53: 0x1 BTE Error Response Short */
+	xpcBteSh2RspShort = xpcBteSh2Start + BTEFAIL_SH2_RESP_SHORT,
+
+				/* 54: 0x2 BTE Error Response Long */
+	xpcBteSh2RspLong = xpcBteSh2Start + BTEFAIL_SH2_RESP_LONG,
+
+				/* 56: 0x4 BTE Error Response DSB */
+	xpcBteSh2RspDSB = xpcBteSh2Start + BTEFAIL_SH2_RESP_DSP,
+
+				/* 60: 0x8 BTE Error Response Access */
+	xpcBteSh2RspAccess = xpcBteSh2Start + BTEFAIL_SH2_RESP_ACCESS,
+
+				/* 68: 0x10 BTE Error CRB timeout */
+	xpcBteSh2CRBTO = xpcBteSh2Start + BTEFAIL_SH2_CRB_TO,
+
+				/* 84: 0x20 BTE Error NACK limit */
+	xpcBteSh2NACKLimit = xpcBteSh2Start + BTEFAIL_SH2_NACK_LIMIT,
+
+				/* 115: BTE end */
+	xpcBteSh2End = xpcBteSh2Start + BTEFAIL_SH2_ALL,
+
+	xpcUnknownReason	/* 116: unknown reason - must be last in enum */
+};
+
+/*
+ * Define the callout function types used by XPC to update the user on
+ * connection activity and state changes (via the user function registered by
+ * xpc_connect()) and to notify them of messages received and delivered (via
+ * the user function registered by xpc_send_notify()).
+ *
+ * The two function types are xpc_channel_func and xpc_notify_func and
+ * both share the following arguments, with the exception of "data", which
+ * only xpc_channel_func has.
+ *
+ * Arguments:
+ *
+ *	reason - reason code. (See following table.)
+ *	partid - partition ID associated with condition.
+ *	ch_number - channel # associated with condition.
+ *	data - pointer to optional data. (See following table.)
+ *	key - pointer to optional user-defined value provided as the "key"
+ *	      argument to xpc_connect() or xpc_send_notify().
+ *
+ * In the following table the "Optional Data" column applies to callouts made
+ * to functions registered by xpc_connect(). A "NA" in that column indicates
+ * that this reason code can be passed to functions registered by
+ * xpc_send_notify() (i.e. they don't have data arguments).
+ *
+ * Also, the first three reason codes in the following table indicate
+ * success, whereas the others indicate failure. When a failure reason code
+ * is received, one can assume that the channel is not connected.
+ *
+ *
+ * Reason Code          | Cause                          | Optional Data
+ * =====================+================================+=====================
+ * xpcConnected         | connection has been established| max #of entries
+ *                      | to the specified partition on  | allowed in message
+ *                      | the specified channel          | queue
+ * ---------------------+--------------------------------+---------------------
+ * xpcMsgReceived       | an XPC message arrived from    | address of payload
+ *                      | the specified partition on the |
+ *                      | specified channel              | [the user must call
+ *                      |                                | xpc_received() when
+ *                      |                                | finished with the
+ *                      |                                | payload]
+ * ---------------------+--------------------------------+---------------------
+ * xpcMsgDelivered      | notification that the message  | NA
+ *                      | was delivered to the intended  |
+ *                      | recipient and that they have   |
+ *                      | acknowledged its receipt by    |
+ *                      | calling xpc_received()         |
+ * =====================+================================+=====================
+ * xpcUnequalMsgSizes   | can't connect to the specified | NULL
+ *                      | partition on the specified     |
+ *                      | channel because of mismatched  |
+ *                      | message sizes                  |
+ * ---------------------+--------------------------------+---------------------
+ * xpcNoMemory          | insufficient memory avaiable   | NULL
+ *                      | to allocate message queue      |
+ * ---------------------+--------------------------------+---------------------
+ * xpcLackOfResources   | lack of resources to create    | NULL
+ *                      | the necessary kthreads to      |
+ *                      | support the channel            |
+ * ---------------------+--------------------------------+---------------------
+ * xpcUnregistering     | this side's user has           | NULL or NA
+ *                      | unregistered by calling        |
+ *                      | xpc_disconnect()               |
+ * ---------------------+--------------------------------+---------------------
+ * xpcOtherUnregistering| the other side's user has      | NULL or NA
+ *                      | unregistered by calling        |
+ *                      | xpc_disconnect()               |
+ * ---------------------+--------------------------------+---------------------
+ * xpcNoHeartbeat       | the other side's XPC is no     | NULL or NA
+ *                      | longer heartbeating            |
+ *                      |                                |
+ * ---------------------+--------------------------------+---------------------
+ * xpcUnloading         | this side's XPC module is      | NULL or NA
+ *                      | being unloaded                 |
+ *                      |                                |
+ * ---------------------+--------------------------------+---------------------
+ * xpcOtherUnloading    | the other side's XPC module is | NULL or NA
+ *                      | is being unloaded              |
+ *                      |                                |
+ * ---------------------+--------------------------------+---------------------
+ * xpcPioReadError      | xp_nofault_PIOR() returned an  | NULL or NA
+ *                      | error while sending an IPI     |
+ *                      |                                |
+ * ---------------------+--------------------------------+---------------------
+ * xpcInvalidAddress    | the address either received or | NULL or NA
+ *                      | sent by the specified partition|
+ *                      | is invalid                     |
+ * ---------------------+--------------------------------+---------------------
+ * xpcBteNotAvailable   | attempt to pull data from the  | NULL or NA
+ * xpcBtePoisonError    | specified partition over the   |
+ * xpcBteWriteError     | specified channel via a        |
+ * xpcBteAccessError    | bte_copy() failed              |
+ * xpcBteTimeOutError   |                                |
+ * xpcBteXtalkError     |                                |
+ * xpcBteDirectoryError |                                |
+ * xpcBteGenericError   |                                |
+ * xpcBteUnmappedError  |                                |
+ * ---------------------+--------------------------------+---------------------
+ * xpcUnknownReason     | the specified channel to the   | NULL or NA
+ *                      | specified partition was        |
+ *                      | unavailable for unknown reasons|
+ * =====================+================================+=====================
+ */
+
+typedef void (*xpc_channel_func) (enum xpc_retval reason, partid_t partid,
+				  int ch_number, void *data, void *key);
+
+typedef void (*xpc_notify_func) (enum xpc_retval reason, partid_t partid,
+				 int ch_number, void *key);
+
+/*
+ * The following is a registration entry. There is a global array of these,
+ * one per channel. It is used to record the connection registration made
+ * by the users of XPC. As long as a registration entry exists, for any
+ * partition that comes up, XPC will attempt to establish a connection on
+ * that channel. Notification that a connection has been made will occur via
+ * the xpc_channel_func function.
+ *
+ * The 'func' field points to the function to call when aynchronous
+ * notification is required for such events as: a connection established/lost,
+ * or an incoming message received, or an error condition encountered. A
+ * non-NULL 'func' field indicates that there is an active registration for
+ * the channel.
+ */
+struct xpc_registration {
+	struct mutex mutex;
+	xpc_channel_func func;	/* function to call */
+	void *key;		/* pointer to user's key */
+	u16 nentries;		/* #of msg entries in local msg queue */
+	u16 msg_size;		/* message queue's message size */
+	u32 assigned_limit;	/* limit on #of assigned kthreads */
+	u32 idle_limit;		/* limit on #of idle kthreads */
+} ____cacheline_aligned;
+
+#define XPC_CHANNEL_REGISTERED(_c)	(xpc_registrations[_c].func != NULL)
+
+/* the following are valid xpc_allocate() flags */
+#define XPC_WAIT	0	/* wait flag */
+#define XPC_NOWAIT	1	/* no wait flag */
+
+struct xpc_interface {
+	void (*connect) (int);
+	void (*disconnect) (int);
+	enum xpc_retval (*allocate) (partid_t, int, u32, void **);
+	enum xpc_retval (*send) (partid_t, int, void *);
+	enum xpc_retval (*send_notify) (partid_t, int, void *,
+					xpc_notify_func, void *);
+	void (*received) (partid_t, int, void *);
+	enum xpc_retval (*partid_to_nasids) (partid_t, void *);
+};
+
+extern struct xpc_interface xpc_interface;
+
+extern void xpc_set_interface(void (*)(int),
+			      void (*)(int),
+			      enum xpc_retval (*)(partid_t, int, u32, void **),
+			      enum xpc_retval (*)(partid_t, int, void *),
+			      enum xpc_retval (*)(partid_t, int, void *,
+						  xpc_notify_func, void *),
+			      void (*)(partid_t, int, void *),
+			      enum xpc_retval (*)(partid_t, void *));
+extern void xpc_clear_interface(void);
+
+extern enum xpc_retval xpc_connect(int, xpc_channel_func, void *, u16,
+				   u16, u32, u32);
+extern void xpc_disconnect(int);
+
+static inline enum xpc_retval
+xpc_allocate(partid_t partid, int ch_number, u32 flags, void **payload)
+{
+	return xpc_interface.allocate(partid, ch_number, flags, payload);
+}
+
+static inline enum xpc_retval
+xpc_send(partid_t partid, int ch_number, void *payload)
+{
+	return xpc_interface.send(partid, ch_number, payload);
+}
+
+static inline enum xpc_retval
+xpc_send_notify(partid_t partid, int ch_number, void *payload,
+		xpc_notify_func func, void *key)
+{
+	return xpc_interface.send_notify(partid, ch_number, payload, func, key);
+}
+
+static inline void
+xpc_received(partid_t partid, int ch_number, void *payload)
+{
+	return xpc_interface.received(partid, ch_number, payload);
+}
+
+static inline enum xpc_retval
+xpc_partid_to_nasids(partid_t partid, void *nasids)
+{
+	return xpc_interface.partid_to_nasids(partid, nasids);
+}
+
+extern u64 xp_nofault_PIOR_target;
+extern int xp_nofault_PIOR(void *);
+extern int xp_error_PIOR(void);
+
+#endif /* _DRIVERS_MISC_SGIXP_XP_H */
diff --git a/drivers/misc/sgi-xp/xp_main.c b/drivers/misc/sgi-xp/xp_main.c
new file mode 100644
index 00000000000..1fbf99bae96
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp_main.c
@@ -0,0 +1,279 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2004-2008 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+/*
+ * Cross Partition (XP) base.
+ *
+ *	XP provides a base from which its users can interact
+ *	with XPC, yet not be dependent on XPC.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <asm/sn/intr.h>
+#include <asm/sn/sn_sal.h>
+#include "xp.h"
+
+/*
+ * The export of xp_nofault_PIOR needs to happen here since it is defined
+ * in drivers/misc/sgi-xp/xp_nofault.S. The target of the nofault read is
+ * defined here.
+ */
+EXPORT_SYMBOL_GPL(xp_nofault_PIOR);
+
+u64 xp_nofault_PIOR_target;
+EXPORT_SYMBOL_GPL(xp_nofault_PIOR_target);
+
+/*
+ * xpc_registrations[] keeps track of xpc_connect()'s done by the kernel-level
+ * users of XPC.
+ */
+struct xpc_registration xpc_registrations[XPC_NCHANNELS];
+EXPORT_SYMBOL_GPL(xpc_registrations);
+
+/*
+ * Initialize the XPC interface to indicate that XPC isn't loaded.
+ */
+static enum xpc_retval
+xpc_notloaded(void)
+{
+	return xpcNotLoaded;
+}
+
+struct xpc_interface xpc_interface = {
+	(void (*)(int))xpc_notloaded,
+	(void (*)(int))xpc_notloaded,
+	(enum xpc_retval(*)(partid_t, int, u32, void **))xpc_notloaded,
+	(enum xpc_retval(*)(partid_t, int, void *))xpc_notloaded,
+	(enum xpc_retval(*)(partid_t, int, void *, xpc_notify_func, void *))
+	    xpc_notloaded,
+	(void (*)(partid_t, int, void *))xpc_notloaded,
+	(enum xpc_retval(*)(partid_t, void *))xpc_notloaded
+};
+EXPORT_SYMBOL_GPL(xpc_interface);
+
+/*
+ * XPC calls this when it (the XPC module) has been loaded.
+ */
+void
+xpc_set_interface(void (*connect) (int),
+		  void (*disconnect) (int),
+		  enum xpc_retval (*allocate) (partid_t, int, u32, void **),
+		  enum xpc_retval (*send) (partid_t, int, void *),
+		  enum xpc_retval (*send_notify) (partid_t, int, void *,
+						  xpc_notify_func, void *),
+		  void (*received) (partid_t, int, void *),
+		  enum xpc_retval (*partid_to_nasids) (partid_t, void *))
+{
+	xpc_interface.connect = connect;
+	xpc_interface.disconnect = disconnect;
+	xpc_interface.allocate = allocate;
+	xpc_interface.send = send;
+	xpc_interface.send_notify = send_notify;
+	xpc_interface.received = received;
+	xpc_interface.partid_to_nasids = partid_to_nasids;
+}
+EXPORT_SYMBOL_GPL(xpc_set_interface);
+
+/*
+ * XPC calls this when it (the XPC module) is being unloaded.
+ */
+void
+xpc_clear_interface(void)
+{
+	xpc_interface.connect = (void (*)(int))xpc_notloaded;
+	xpc_interface.disconnect = (void (*)(int))xpc_notloaded;
+	xpc_interface.allocate = (enum xpc_retval(*)(partid_t, int, u32,
+						     void **))xpc_notloaded;
+	xpc_interface.send = (enum xpc_retval(*)(partid_t, int, void *))
+	    xpc_notloaded;
+	xpc_interface.send_notify = (enum xpc_retval(*)(partid_t, int, void *,
+							xpc_notify_func,
+							void *))xpc_notloaded;
+	xpc_interface.received = (void (*)(partid_t, int, void *))
+	    xpc_notloaded;
+	xpc_interface.partid_to_nasids = (enum xpc_retval(*)(partid_t, void *))
+	    xpc_notloaded;
+}
+EXPORT_SYMBOL_GPL(xpc_clear_interface);
+
+/*
+ * Register for automatic establishment of a channel connection whenever
+ * a partition comes up.
+ *
+ * Arguments:
+ *
+ *	ch_number - channel # to register for connection.
+ *	func - function to call for asynchronous notification of channel
+ *	       state changes (i.e., connection, disconnection, error) and
+ *	       the arrival of incoming messages.
+ *      key - pointer to optional user-defined value that gets passed back
+ *	      to the user on any callouts made to func.
+ *	payload_size - size in bytes of the XPC message's payload area which
+ *		       contains a user-defined message. The user should make
+ *		       this large enough to hold their largest message.
+ *	nentries - max #of XPC message entries a message queue can contain.
+ *		   The actual number, which is determined when a connection
+ * 		   is established and may be less then requested, will be
+ *		   passed to the user via the xpcConnected callout.
+ *	assigned_limit - max number of kthreads allowed to be processing
+ * 			 messages (per connection) at any given instant.
+ *	idle_limit - max number of kthreads allowed to be idle at any given
+ * 		     instant.
+ */
+enum xpc_retval
+xpc_connect(int ch_number, xpc_channel_func func, void *key, u16 payload_size,
+	    u16 nentries, u32 assigned_limit, u32 idle_limit)
+{
+	struct xpc_registration *registration;
+
+	DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS);
+	DBUG_ON(payload_size == 0 || nentries == 0);
+	DBUG_ON(func == NULL);
+	DBUG_ON(assigned_limit == 0 || idle_limit > assigned_limit);
+
+	registration = &xpc_registrations[ch_number];
+
+	if (mutex_lock_interruptible(&registration->mutex) != 0)
+		return xpcInterrupted;
+
+	/* if XPC_CHANNEL_REGISTERED(ch_number) */
+	if (registration->func != NULL) {
+		mutex_unlock(&registration->mutex);
+		return xpcAlreadyRegistered;
+	}
+
+	/* register the channel for connection */
+	registration->msg_size = XPC_MSG_SIZE(payload_size);
+	registration->nentries = nentries;
+	registration->assigned_limit = assigned_limit;
+	registration->idle_limit = idle_limit;
+	registration->key = key;
+	registration->func = func;
+
+	mutex_unlock(&registration->mutex);
+
+	xpc_interface.connect(ch_number);
+
+	return xpcSuccess;
+}
+EXPORT_SYMBOL_GPL(xpc_connect);
+
+/*
+ * Remove the registration for automatic connection of the specified channel
+ * when a partition comes up.
+ *
+ * Before returning this xpc_disconnect() will wait for all connections on the
+ * specified channel have been closed/torndown. So the caller can be assured
+ * that they will not be receiving any more callouts from XPC to their
+ * function registered via xpc_connect().
+ *
+ * Arguments:
+ *
+ *	ch_number - channel # to unregister.
+ */
+void
+xpc_disconnect(int ch_number)
+{
+	struct xpc_registration *registration;
+
+	DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS);
+
+	registration = &xpc_registrations[ch_number];
+
+	/*
+	 * We've decided not to make this a down_interruptible(), since we
+	 * figured XPC's users will just turn around and call xpc_disconnect()
+	 * again anyways, so we might as well wait, if need be.
+	 */
+	mutex_lock(&registration->mutex);
+
+	/* if !XPC_CHANNEL_REGISTERED(ch_number) */
+	if (registration->func == NULL) {
+		mutex_unlock(&registration->mutex);
+		return;
+	}
+
+	/* remove the connection registration for the specified channel */
+	registration->func = NULL;
+	registration->key = NULL;
+	registration->nentries = 0;
+	registration->msg_size = 0;
+	registration->assigned_limit = 0;
+	registration->idle_limit = 0;
+
+	xpc_interface.disconnect(ch_number);
+
+	mutex_unlock(&registration->mutex);
+
+	return;
+}
+EXPORT_SYMBOL_GPL(xpc_disconnect);
+
+int __init
+xp_init(void)
+{
+	int ret, ch_number;
+	u64 func_addr = *(u64 *)xp_nofault_PIOR;
+	u64 err_func_addr = *(u64 *)xp_error_PIOR;
+
+	if (!ia64_platform_is("sn2"))
+		return -ENODEV;
+
+	/*
+	 * Register a nofault code region which performs a cross-partition
+	 * PIO read. If the PIO read times out, the MCA handler will consume
+	 * the error and return to a kernel-provided instruction to indicate
+	 * an error. This PIO read exists because it is guaranteed to timeout
+	 * if the destination is down (AMO operations do not timeout on at
+	 * least some CPUs on Shubs <= v1.2, which unfortunately we have to
+	 * work around).
+	 */
+	ret = sn_register_nofault_code(func_addr, err_func_addr, err_func_addr,
+				       1, 1);
+	if (ret != 0) {
+		printk(KERN_ERR "XP: can't register nofault code, error=%d\n",
+		       ret);
+	}
+	/*
+	 * Setup the nofault PIO read target. (There is no special reason why
+	 * SH_IPI_ACCESS was selected.)
+	 */
+	if (is_shub2())
+		xp_nofault_PIOR_target = SH2_IPI_ACCESS0;
+	else
+		xp_nofault_PIOR_target = SH1_IPI_ACCESS;
+
+	/* initialize the connection registration mutex */
+	for (ch_number = 0; ch_number < XPC_NCHANNELS; ch_number++)
+		mutex_init(&xpc_registrations[ch_number].mutex);
+
+	return 0;
+}
+
+module_init(xp_init);
+
+void __exit
+xp_exit(void)
+{
+	u64 func_addr = *(u64 *)xp_nofault_PIOR;
+	u64 err_func_addr = *(u64 *)xp_error_PIOR;
+
+	/* unregister the PIO read nofault code region */
+	(void)sn_register_nofault_code(func_addr, err_func_addr,
+				       err_func_addr, 1, 0);
+}
+
+module_exit(xp_exit);
+
+MODULE_AUTHOR("Silicon Graphics, Inc.");
+MODULE_DESCRIPTION("Cross Partition (XP) base");
+MODULE_LICENSE("GPL");
diff --git a/drivers/misc/sgi-xp/xp_nofault.S b/drivers/misc/sgi-xp/xp_nofault.S
new file mode 100644
index 00000000000..e38d4331942
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp_nofault.S
@@ -0,0 +1,35 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2004-2008 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+/*
+ * The xp_nofault_PIOR function takes a pointer to a remote PIO register
+ * and attempts to load and consume a value from it.  This function
+ * will be registered as a nofault code block.  In the event that the
+ * PIO read fails, the MCA handler will force the error to look
+ * corrected and vector to the xp_error_PIOR which will return an error.
+ *
+ * The definition of "consumption" and the time it takes for an MCA
+ * to surface is processor implementation specific.  This code
+ * is sufficient on Itanium through the Montvale processor family.
+ * It may need to be adjusted for future processor implementations.
+ *
+ *	extern int xp_nofault_PIOR(void *remote_register);
+ */
+
+	.global xp_nofault_PIOR
+xp_nofault_PIOR:
+	mov	r8=r0			// Stage a success return value
+	ld8.acq	r9=[r32];;		// PIO Read the specified register
+	adds	r9=1,r9;;		// Add to force consumption
+	srlz.i;;			// Allow time for MCA to surface
+	br.ret.sptk.many b0;;		// Return success
+
+	.global xp_error_PIOR
+xp_error_PIOR:
+	mov	r8=1			// Return value of 1
+	br.ret.sptk.many b0;;		// Return failure
diff --git a/drivers/misc/sgi-xp/xpc.h b/drivers/misc/sgi-xp/xpc.h
new file mode 100644
index 00000000000..9eb6d4a3269
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc.h
@@ -0,0 +1,1187 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2004-2008 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+/*
+ * Cross Partition Communication (XPC) structures and macros.
+ */
+
+#ifndef _DRIVERS_MISC_SGIXP_XPC_H
+#define _DRIVERS_MISC_SGIXP_XPC_H
+
+#include <linux/interrupt.h>
+#include <linux/sysctl.h>
+#include <linux/device.h>
+#include <linux/mutex.h>
+#include <linux/completion.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/sn/bte.h>
+#include <asm/sn/clksupport.h>
+#include <asm/sn/addrs.h>
+#include <asm/sn/mspec.h>
+#include <asm/sn/shub_mmr.h>
+#include "xp.h"
+
+/*
+ * XPC Version numbers consist of a major and minor number. XPC can always
+ * talk to versions with same major #, and never talk to versions with a
+ * different major #.
+ */
+#define _XPC_VERSION(_maj, _min)	(((_maj) << 4) | ((_min) & 0xf))
+#define XPC_VERSION_MAJOR(_v)		((_v) >> 4)
+#define XPC_VERSION_MINOR(_v)		((_v) & 0xf)
+
+/*
+ * The next macros define word or bit representations for given
+ * C-brick nasid in either the SAL provided bit array representing
+ * nasids in the partition/machine or the AMO_t array used for
+ * inter-partition initiation communications.
+ *
+ * For SN2 machines, C-Bricks are alway even numbered NASIDs.  As
+ * such, some space will be saved by insisting that nasid information
+ * passed from SAL always be packed for C-Bricks and the
+ * cross-partition interrupts use the same packing scheme.
+ */
+#define XPC_NASID_W_INDEX(_n)	(((_n) / 64) / 2)
+#define XPC_NASID_B_INDEX(_n)	(((_n) / 2) & (64 - 1))
+#define XPC_NASID_IN_ARRAY(_n, _p) ((_p)[XPC_NASID_W_INDEX(_n)] & \
+				    (1UL << XPC_NASID_B_INDEX(_n)))
+#define XPC_NASID_FROM_W_B(_w, _b) (((_w) * 64 + (_b)) * 2)
+
+#define XPC_HB_DEFAULT_INTERVAL		5	/* incr HB every x secs */
+#define XPC_HB_CHECK_DEFAULT_INTERVAL	20	/* check HB every x secs */
+
+/* define the process name of HB checker and the CPU it is pinned to */
+#define XPC_HB_CHECK_THREAD_NAME	"xpc_hb"
+#define XPC_HB_CHECK_CPU		0
+
+/* define the process name of the discovery thread */
+#define XPC_DISCOVERY_THREAD_NAME	"xpc_discovery"
+
+/*
+ * the reserved page
+ *
+ *   SAL reserves one page of memory per partition for XPC. Though a full page
+ *   in length (16384 bytes), its starting address is not page aligned, but it
+ *   is cacheline aligned. The reserved page consists of the following:
+ *
+ *   reserved page header
+ *
+ *     The first cacheline of the reserved page contains the header
+ *     (struct xpc_rsvd_page). Before SAL initialization has completed,
+ *     SAL has set up the following fields of the reserved page header:
+ *     SAL_signature, SAL_version, partid, and nasids_size. The other
+ *     fields are set up by XPC. (xpc_rsvd_page points to the local
+ *     partition's reserved page.)
+ *
+ *   part_nasids mask
+ *   mach_nasids mask
+ *
+ *     SAL also sets up two bitmaps (or masks), one that reflects the actual
+ *     nasids in this partition (part_nasids), and the other that reflects
+ *     the actual nasids in the entire machine (mach_nasids). We're only
+ *     interested in the even numbered nasids (which contain the processors
+ *     and/or memory), so we only need half as many bits to represent the
+ *     nasids. The part_nasids mask is located starting at the first cacheline
+ *     following the reserved page header. The mach_nasids mask follows right
+ *     after the part_nasids mask. The size in bytes of each mask is reflected
+ *     by the reserved page header field 'nasids_size'. (Local partition's
+ *     mask pointers are xpc_part_nasids and xpc_mach_nasids.)
+ *
+ *   vars
+ *   vars part
+ *
+ *     Immediately following the mach_nasids mask are the XPC variables
+ *     required by other partitions. First are those that are generic to all
+ *     partitions (vars), followed on the next available cacheline by those
+ *     which are partition specific (vars part). These are setup by XPC.
+ *     (Local partition's vars pointers are xpc_vars and xpc_vars_part.)
+ *
+ * Note: Until vars_pa is set, the partition XPC code has not been initialized.
+ */
+struct xpc_rsvd_page {
+	u64 SAL_signature;	/* SAL: unique signature */
+	u64 SAL_version;	/* SAL: version */
+	u8 partid;		/* SAL: partition ID */
+	u8 version;
+	u8 pad1[6];		/* align to next u64 in cacheline */
+	u64 vars_pa;		/* physical address of struct xpc_vars */
+	struct timespec stamp;	/* time when reserved page was setup by XPC */
+	u64 pad2[9];		/* align to last u64 in cacheline */
+	u64 nasids_size;	/* SAL: size of each nasid mask in bytes */
+};
+
+#define XPC_RP_VERSION _XPC_VERSION(1, 1) /* version 1.1 of the reserved page */
+
+#define XPC_SUPPORTS_RP_STAMP(_version) \
+			(_version >= _XPC_VERSION(1, 1))
+
+/*
+ * compare stamps - the return value is:
+ *
+ *	< 0,	if stamp1 < stamp2
+ *	= 0,	if stamp1 == stamp2
+ *	> 0,	if stamp1 > stamp2
+ */
+static inline int
+xpc_compare_stamps(struct timespec *stamp1, struct timespec *stamp2)
+{
+	int ret;
+
+	ret = stamp1->tv_sec - stamp2->tv_sec;
+	if (ret == 0)
+		ret = stamp1->tv_nsec - stamp2->tv_nsec;
+
+	return ret;
+}
+
+/*
+ * Define the structures by which XPC variables can be exported to other
+ * partitions. (There are two: struct xpc_vars and struct xpc_vars_part)
+ */
+
+/*
+ * The following structure describes the partition generic variables
+ * needed by other partitions in order to properly initialize.
+ *
+ * struct xpc_vars version number also applies to struct xpc_vars_part.
+ * Changes to either structure and/or related functionality should be
+ * reflected by incrementing either the major or minor version numbers
+ * of struct xpc_vars.
+ */
+struct xpc_vars {
+	u8 version;
+	u64 heartbeat;
+	u64 heartbeating_to_mask;
+	u64 heartbeat_offline;	/* if 0, heartbeat should be changing */
+	int act_nasid;
+	int act_phys_cpuid;
+	u64 vars_part_pa;
+	u64 amos_page_pa;	/* paddr of page of AMOs from MSPEC driver */
+	AMO_t *amos_page;	/* vaddr of page of AMOs from MSPEC driver */
+};
+
+#define XPC_V_VERSION _XPC_VERSION(3, 1)    /* version 3.1 of the cross vars */
+
+#define XPC_SUPPORTS_DISENGAGE_REQUEST(_version) \
+			(_version >= _XPC_VERSION(3, 1))
+
+static inline int
+xpc_hb_allowed(partid_t partid, struct xpc_vars *vars)
+{
+	return ((vars->heartbeating_to_mask & (1UL << partid)) != 0);
+}
+
+static inline void
+xpc_allow_hb(partid_t partid, struct xpc_vars *vars)
+{
+	u64 old_mask, new_mask;
+
+	do {
+		old_mask = vars->heartbeating_to_mask;
+		new_mask = (old_mask | (1UL << partid));
+	} while (cmpxchg(&vars->heartbeating_to_mask, old_mask, new_mask) !=
+		 old_mask);
+}
+
+static inline void
+xpc_disallow_hb(partid_t partid, struct xpc_vars *vars)
+{
+	u64 old_mask, new_mask;
+
+	do {
+		old_mask = vars->heartbeating_to_mask;
+		new_mask = (old_mask & ~(1UL << partid));
+	} while (cmpxchg(&vars->heartbeating_to_mask, old_mask, new_mask) !=
+		 old_mask);
+}
+
+/*
+ * The AMOs page consists of a number of AMO variables which are divided into
+ * four groups, The first two groups are used to identify an IRQ's sender.
+ * These two groups consist of 64 and 128 AMO variables respectively. The last
+ * two groups, consisting of just one AMO variable each, are used to identify
+ * the remote partitions that are currently engaged (from the viewpoint of
+ * the XPC running on the remote partition).
+ */
+#define XPC_NOTIFY_IRQ_AMOS	   0
+#define XPC_ACTIVATE_IRQ_AMOS	   (XPC_NOTIFY_IRQ_AMOS + XP_MAX_PARTITIONS)
+#define XPC_ENGAGED_PARTITIONS_AMO (XPC_ACTIVATE_IRQ_AMOS + XP_NASID_MASK_WORDS)
+#define XPC_DISENGAGE_REQUEST_AMO  (XPC_ENGAGED_PARTITIONS_AMO + 1)
+
+/*
+ * The following structure describes the per partition specific variables.
+ *
+ * An array of these structures, one per partition, will be defined. As a
+ * partition becomes active XPC will copy the array entry corresponding to
+ * itself from that partition. It is desirable that the size of this
+ * structure evenly divide into a cacheline, such that none of the entries
+ * in this array crosses a cacheline boundary. As it is now, each entry
+ * occupies half a cacheline.
+ */
+struct xpc_vars_part {
+	u64 magic;
+
+	u64 openclose_args_pa;	/* physical address of open and close args */
+	u64 GPs_pa;		/* physical address of Get/Put values */
+
+	u64 IPI_amo_pa;		/* physical address of IPI AMO_t structure */
+	int IPI_nasid;		/* nasid of where to send IPIs */
+	int IPI_phys_cpuid;	/* physical CPU ID of where to send IPIs */
+
+	u8 nchannels;		/* #of defined channels supported */
+
+	u8 reserved[23];	/* pad to a full 64 bytes */
+};
+
+/*
+ * The vars_part MAGIC numbers play a part in the first contact protocol.
+ *
+ * MAGIC1 indicates that the per partition specific variables for a remote
+ * partition have been initialized by this partition.
+ *
+ * MAGIC2 indicates that this partition has pulled the remote partititions
+ * per partition variables that pertain to this partition.
+ */
+#define XPC_VP_MAGIC1	0x0053524156435058L   /* 'XPCVARS\0'L (little endian) */
+#define XPC_VP_MAGIC2	0x0073726176435058L   /* 'XPCvars\0'L (little endian) */
+
+/* the reserved page sizes and offsets */
+
+#define XPC_RP_HEADER_SIZE	L1_CACHE_ALIGN(sizeof(struct xpc_rsvd_page))
+#define XPC_RP_VARS_SIZE	L1_CACHE_ALIGN(sizeof(struct xpc_vars))
+
+#define XPC_RP_PART_NASIDS(_rp) ((u64 *)((u8 *)(_rp) + XPC_RP_HEADER_SIZE))
+#define XPC_RP_MACH_NASIDS(_rp) (XPC_RP_PART_NASIDS(_rp) + xp_nasid_mask_words)
+#define XPC_RP_VARS(_rp)	((struct xpc_vars *)(XPC_RP_MACH_NASIDS(_rp) + \
+				    xp_nasid_mask_words))
+#define XPC_RP_VARS_PART(_rp)	((struct xpc_vars_part *) \
+				    ((u8 *)XPC_RP_VARS(_rp) + XPC_RP_VARS_SIZE))
+
+/*
+ * Functions registered by add_timer() or called by kernel_thread() only
+ * allow for a single 64-bit argument. The following macros can be used to
+ * pack and unpack two (32-bit, 16-bit or 8-bit) arguments into or out from
+ * the passed argument.
+ */
+#define XPC_PACK_ARGS(_arg1, _arg2) \
+			((((u64) _arg1) & 0xffffffff) | \
+			((((u64) _arg2) & 0xffffffff) << 32))
+
+#define XPC_UNPACK_ARG1(_args)	(((u64) _args) & 0xffffffff)
+#define XPC_UNPACK_ARG2(_args)	((((u64) _args) >> 32) & 0xffffffff)
+
+/*
+ * Define a Get/Put value pair (pointers) used with a message queue.
+ */
+struct xpc_gp {
+	s64 get;		/* Get value */
+	s64 put;		/* Put value */
+};
+
+#define XPC_GP_SIZE \
+		L1_CACHE_ALIGN(sizeof(struct xpc_gp) * XPC_NCHANNELS)
+
+/*
+ * Define a structure that contains arguments associated with opening and
+ * closing a channel.
+ */
+struct xpc_openclose_args {
+	u16 reason;		/* reason why channel is closing */
+	u16 msg_size;		/* sizeof each message entry */
+	u16 remote_nentries;	/* #of message entries in remote msg queue */
+	u16 local_nentries;	/* #of message entries in local msg queue */
+	u64 local_msgqueue_pa;	/* physical address of local message queue */
+};
+
+#define XPC_OPENCLOSE_ARGS_SIZE \
+	      L1_CACHE_ALIGN(sizeof(struct xpc_openclose_args) * XPC_NCHANNELS)
+
+/* struct xpc_msg flags */
+
+#define	XPC_M_DONE		0x01	/* msg has been received/consumed */
+#define	XPC_M_READY		0x02	/* msg is ready to be sent */
+#define	XPC_M_INTERRUPT		0x04	/* send interrupt when msg consumed */
+
+#define XPC_MSG_ADDRESS(_payload) \
+		((struct xpc_msg *)((u8 *)(_payload) - XPC_MSG_PAYLOAD_OFFSET))
+
+/*
+ * Defines notify entry.
+ *
+ * This is used to notify a message's sender that their message was received
+ * and consumed by the intended recipient.
+ */
+struct xpc_notify {
+	u8 type;		/* type of notification */
+
+	/* the following two fields are only used if type == XPC_N_CALL */
+	xpc_notify_func func;	/* user's notify function */
+	void *key;		/* pointer to user's key */
+};
+
+/* struct xpc_notify type of notification */
+
+#define	XPC_N_CALL		0x01	/* notify function provided by user */
+
+/*
+ * Define the structure that manages all the stuff required by a channel. In
+ * particular, they are used to manage the messages sent across the channel.
+ *
+ * This structure is private to a partition, and is NOT shared across the
+ * partition boundary.
+ *
+ * There is an array of these structures for each remote partition. It is
+ * allocated at the time a partition becomes active. The array contains one
+ * of these structures for each potential channel connection to that partition.
+ *
+ * Each of these structures manages two message queues (circular buffers).
+ * They are allocated at the time a channel connection is made. One of
+ * these message queues (local_msgqueue) holds the locally created messages
+ * that are destined for the remote partition. The other of these message
+ * queues (remote_msgqueue) is a locally cached copy of the remote partition's
+ * own local_msgqueue.
+ *
+ * The following is a description of the Get/Put pointers used to manage these
+ * two message queues. Consider the local_msgqueue to be on one partition
+ * and the remote_msgqueue to be its cached copy on another partition. A
+ * description of what each of the lettered areas contains is included.
+ *
+ *
+ *                     local_msgqueue      remote_msgqueue
+ *
+ *                        |/////////|      |/////////|
+ *    w_remote_GP.get --> +---------+      |/////////|
+ *                        |    F    |      |/////////|
+ *     remote_GP.get  --> +---------+      +---------+ <-- local_GP->get
+ *                        |         |      |         |
+ *                        |         |      |    E    |
+ *                        |         |      |         |
+ *                        |         |      +---------+ <-- w_local_GP.get
+ *                        |    B    |      |/////////|
+ *                        |         |      |////D////|
+ *                        |         |      |/////////|
+ *                        |         |      +---------+ <-- w_remote_GP.put
+ *                        |         |      |////C////|
+ *      local_GP->put --> +---------+      +---------+ <-- remote_GP.put
+ *                        |         |      |/////////|
+ *                        |    A    |      |/////////|
+ *                        |         |      |/////////|
+ *     w_local_GP.put --> +---------+      |/////////|
+ *                        |/////////|      |/////////|
+ *
+ *
+ *	    ( remote_GP.[get|put] are cached copies of the remote
+ *	      partition's local_GP->[get|put], and thus their values can
+ *	      lag behind their counterparts on the remote partition. )
+ *
+ *
+ *  A - Messages that have been allocated, but have not yet been sent to the
+ *	remote partition.
+ *
+ *  B - Messages that have been sent, but have not yet been acknowledged by the
+ *      remote partition as having been received.
+ *
+ *  C - Area that needs to be prepared for the copying of sent messages, by
+ *	the clearing of the message flags of any previously received messages.
+ *
+ *  D - Area into which sent messages are to be copied from the remote
+ *	partition's local_msgqueue and then delivered to their intended
+ *	recipients. [ To allow for a multi-message copy, another pointer
+ *	(next_msg_to_pull) has been added to keep track of the next message
+ *	number needing to be copied (pulled). It chases after w_remote_GP.put.
+ *	Any messages lying between w_local_GP.get and next_msg_to_pull have
+ *	been copied and are ready to be delivered. ]
+ *
+ *  E - Messages that have been copied and delivered, but have not yet been
+ *	acknowledged by the recipient as having been received.
+ *
+ *  F - Messages that have been acknowledged, but XPC has not yet notified the
+ *	sender that the message was received by its intended recipient.
+ *	This is also an area that needs to be prepared for the allocating of
+ *	new messages, by the clearing of the message flags of the acknowledged
+ *	messages.
+ */
+struct xpc_channel {
+	partid_t partid;	/* ID of remote partition connected */
+	spinlock_t lock;	/* lock for updating this structure */
+	u32 flags;		/* general flags */
+
+	enum xpc_retval reason;	/* reason why channel is disconnect'g */
+	int reason_line;	/* line# disconnect initiated from */
+
+	u16 number;		/* channel # */
+
+	u16 msg_size;		/* sizeof each msg entry */
+	u16 local_nentries;	/* #of msg entries in local msg queue */
+	u16 remote_nentries;	/* #of msg entries in remote msg queue */
+
+	void *local_msgqueue_base;	/* base address of kmalloc'd space */
+	struct xpc_msg *local_msgqueue;	/* local message queue */
+	void *remote_msgqueue_base;	/* base address of kmalloc'd space */
+	struct xpc_msg *remote_msgqueue; /* cached copy of remote partition's */
+					 /* local message queue */
+	u64 remote_msgqueue_pa;	/* phys addr of remote partition's */
+				/* local message queue */
+
+	atomic_t references;	/* #of external references to queues */
+
+	atomic_t n_on_msg_allocate_wq;	/* #on msg allocation wait queue */
+	wait_queue_head_t msg_allocate_wq;	/* msg allocation wait queue */
+
+	u8 delayed_IPI_flags;	/* IPI flags received, but delayed */
+				/* action until channel disconnected */
+
+	/* queue of msg senders who want to be notified when msg received */
+
+	atomic_t n_to_notify;	/* #of msg senders to notify */
+	struct xpc_notify *notify_queue;    /* notify queue for messages sent */
+
+	xpc_channel_func func;	/* user's channel function */
+	void *key;		/* pointer to user's key */
+
+	struct mutex msg_to_pull_mutex;	/* next msg to pull serialization */
+	struct completion wdisconnect_wait;    /* wait for channel disconnect */
+
+	struct xpc_openclose_args *local_openclose_args; /* args passed on */
+					     /* opening or closing of channel */
+
+	/* various flavors of local and remote Get/Put values */
+
+	struct xpc_gp *local_GP;	/* local Get/Put values */
+	struct xpc_gp remote_GP;	/* remote Get/Put values */
+	struct xpc_gp w_local_GP;	/* working local Get/Put values */
+	struct xpc_gp w_remote_GP;	/* working remote Get/Put values */
+	s64 next_msg_to_pull;	/* Put value of next msg to pull */
+
+	/* kthread management related fields */
+
+	atomic_t kthreads_assigned;	/* #of kthreads assigned to channel */
+	u32 kthreads_assigned_limit;	/* limit on #of kthreads assigned */
+	atomic_t kthreads_idle;	/* #of kthreads idle waiting for work */
+	u32 kthreads_idle_limit;	/* limit on #of kthreads idle */
+	atomic_t kthreads_active;	/* #of kthreads actively working */
+
+	wait_queue_head_t idle_wq;	/* idle kthread wait queue */
+
+} ____cacheline_aligned;
+
+/* struct xpc_channel flags */
+
+#define	XPC_C_WASCONNECTED	0x00000001	/* channel was connected */
+
+#define	XPC_C_ROPENREPLY	0x00000002	/* remote open channel reply */
+#define	XPC_C_OPENREPLY		0x00000004	/* local open channel reply */
+#define	XPC_C_ROPENREQUEST	0x00000008     /* remote open channel request */
+#define	XPC_C_OPENREQUEST	0x00000010	/* local open channel request */
+
+#define	XPC_C_SETUP		0x00000020 /* channel's msgqueues are alloc'd */
+#define	XPC_C_CONNECTEDCALLOUT	0x00000040     /* connected callout initiated */
+#define	XPC_C_CONNECTEDCALLOUT_MADE \
+				0x00000080     /* connected callout completed */
+#define	XPC_C_CONNECTED		0x00000100	/* local channel is connected */
+#define	XPC_C_CONNECTING	0x00000200	/* channel is being connected */
+
+#define	XPC_C_RCLOSEREPLY	0x00000400	/* remote close channel reply */
+#define	XPC_C_CLOSEREPLY	0x00000800	/* local close channel reply */
+#define	XPC_C_RCLOSEREQUEST	0x00001000    /* remote close channel request */
+#define	XPC_C_CLOSEREQUEST	0x00002000     /* local close channel request */
+
+#define	XPC_C_DISCONNECTED	0x00004000	/* channel is disconnected */
+#define	XPC_C_DISCONNECTING	0x00008000   /* channel is being disconnected */
+#define	XPC_C_DISCONNECTINGCALLOUT \
+				0x00010000 /* disconnecting callout initiated */
+#define	XPC_C_DISCONNECTINGCALLOUT_MADE \
+				0x00020000 /* disconnecting callout completed */
+#define	XPC_C_WDISCONNECT	0x00040000  /* waiting for channel disconnect */
+
+/*
+ * Manages channels on a partition basis. There is one of these structures
+ * for each partition (a partition will never utilize the structure that
+ * represents itself).
+ */
+struct xpc_partition {
+
+	/* XPC HB infrastructure */
+
+	u8 remote_rp_version;	/* version# of partition's rsvd pg */
+	struct timespec remote_rp_stamp; /* time when rsvd pg was initialized */
+	u64 remote_rp_pa;	/* phys addr of partition's rsvd pg */
+	u64 remote_vars_pa;	/* phys addr of partition's vars */
+	u64 remote_vars_part_pa;	/* phys addr of partition's vars part */
+	u64 last_heartbeat;	/* HB at last read */
+	u64 remote_amos_page_pa;	/* phys addr of partition's amos page */
+	int remote_act_nasid;	/* active part's act/deact nasid */
+	int remote_act_phys_cpuid;	/* active part's act/deact phys cpuid */
+	u32 act_IRQ_rcvd;	/* IRQs since activation */
+	spinlock_t act_lock;	/* protect updating of act_state */
+	u8 act_state;		/* from XPC HB viewpoint */
+	u8 remote_vars_version;	/* version# of partition's vars */
+	enum xpc_retval reason;	/* reason partition is deactivating */
+	int reason_line;	/* line# deactivation initiated from */
+	int reactivate_nasid;	/* nasid in partition to reactivate */
+
+	unsigned long disengage_request_timeout;	/* timeout in jiffies */
+	struct timer_list disengage_request_timer;
+
+	/* XPC infrastructure referencing and teardown control */
+
+	u8 setup_state;		/* infrastructure setup state */
+	wait_queue_head_t teardown_wq;	/* kthread waiting to teardown infra */
+	atomic_t references;	/* #of references to infrastructure */
+
+	/*
+	 * NONE OF THE PRECEDING FIELDS OF THIS STRUCTURE WILL BE CLEARED WHEN
+	 * XPC SETS UP THE NECESSARY INFRASTRUCTURE TO SUPPORT CROSS PARTITION
+	 * COMMUNICATION. ALL OF THE FOLLOWING FIELDS WILL BE CLEARED. (THE
+	 * 'nchannels' FIELD MUST BE THE FIRST OF THE FIELDS TO BE CLEARED.)
+	 */
+
+	u8 nchannels;		/* #of defined channels supported */
+	atomic_t nchannels_active;  /* #of channels that are not DISCONNECTED */
+	atomic_t nchannels_engaged;  /* #of channels engaged with remote part */
+	struct xpc_channel *channels;	/* array of channel structures */
+
+	void *local_GPs_base;	/* base address of kmalloc'd space */
+	struct xpc_gp *local_GPs;	/* local Get/Put values */
+	void *remote_GPs_base;	/* base address of kmalloc'd space */
+	struct xpc_gp *remote_GPs;	/* copy of remote partition's local */
+					/* Get/Put values */
+	u64 remote_GPs_pa;	/* phys address of remote partition's local */
+				/* Get/Put values */
+
+	/* fields used to pass args when opening or closing a channel */
+
+	void *local_openclose_args_base;   /* base address of kmalloc'd space */
+	struct xpc_openclose_args *local_openclose_args;      /* local's args */
+	void *remote_openclose_args_base;  /* base address of kmalloc'd space */
+	struct xpc_openclose_args *remote_openclose_args; /* copy of remote's */
+							  /* args */
+	u64 remote_openclose_args_pa;	/* phys addr of remote's args */
+
+	/* IPI sending, receiving and handling related fields */
+
+	int remote_IPI_nasid;	/* nasid of where to send IPIs */
+	int remote_IPI_phys_cpuid;	/* phys CPU ID of where to send IPIs */
+	AMO_t *remote_IPI_amo_va;    /* address of remote IPI AMO_t structure */
+
+	AMO_t *local_IPI_amo_va;	/* address of IPI AMO_t structure */
+	u64 local_IPI_amo;	/* IPI amo flags yet to be handled */
+	char IPI_owner[8];	/* IPI owner's name */
+	struct timer_list dropped_IPI_timer;	/* dropped IPI timer */
+
+	spinlock_t IPI_lock;	/* IPI handler lock */
+
+	/* channel manager related fields */
+
+	atomic_t channel_mgr_requests;	/* #of requests to activate chan mgr */
+	wait_queue_head_t channel_mgr_wq;	/* channel mgr's wait queue */
+
+} ____cacheline_aligned;
+
+/* struct xpc_partition act_state values (for XPC HB) */
+
+#define	XPC_P_INACTIVE		0x00	/* partition is not active */
+#define XPC_P_ACTIVATION_REQ	0x01	/* created thread to activate */
+#define XPC_P_ACTIVATING	0x02	/* activation thread started */
+#define XPC_P_ACTIVE		0x03	/* xpc_partition_up() was called */
+#define XPC_P_DEACTIVATING	0x04	/* partition deactivation initiated */
+
+#define XPC_DEACTIVATE_PARTITION(_p, _reason) \
+			xpc_deactivate_partition(__LINE__, (_p), (_reason))
+
+/* struct xpc_partition setup_state values */
+
+#define XPC_P_UNSET		0x00	/* infrastructure was never setup */
+#define XPC_P_SETUP		0x01	/* infrastructure is setup */
+#define XPC_P_WTEARDOWN		0x02	/* waiting to teardown infrastructure */
+#define XPC_P_TORNDOWN		0x03	/* infrastructure is torndown */
+
+/*
+ * struct xpc_partition IPI_timer #of seconds to wait before checking for
+ * dropped IPIs. These occur whenever an IPI amo write doesn't complete until
+ * after the IPI was received.
+ */
+#define XPC_P_DROPPED_IPI_WAIT	(0.25 * HZ)
+
+/* number of seconds to wait for other partitions to disengage */
+#define XPC_DISENGAGE_REQUEST_DEFAULT_TIMELIMIT	90
+
+/* interval in seconds to print 'waiting disengagement' messages */
+#define XPC_DISENGAGE_PRINTMSG_INTERVAL		10
+
+#define XPC_PARTID(_p)	((partid_t) ((_p) - &xpc_partitions[0]))
+
+/* found in xp_main.c */
+extern struct xpc_registration xpc_registrations[];
+
+/* found in xpc_main.c */
+extern struct device *xpc_part;
+extern struct device *xpc_chan;
+extern int xpc_disengage_request_timelimit;
+extern int xpc_disengage_request_timedout;
+extern irqreturn_t xpc_notify_IRQ_handler(int, void *);
+extern void xpc_dropped_IPI_check(struct xpc_partition *);
+extern void xpc_activate_partition(struct xpc_partition *);
+extern void xpc_activate_kthreads(struct xpc_channel *, int);
+extern void xpc_create_kthreads(struct xpc_channel *, int, int);
+extern void xpc_disconnect_wait(int);
+
+/* found in xpc_partition.c */
+extern int xpc_exiting;
+extern struct xpc_vars *xpc_vars;
+extern struct xpc_rsvd_page *xpc_rsvd_page;
+extern struct xpc_vars_part *xpc_vars_part;
+extern struct xpc_partition xpc_partitions[XP_MAX_PARTITIONS + 1];
+extern char *xpc_remote_copy_buffer;
+extern void *xpc_remote_copy_buffer_base;
+extern void *xpc_kmalloc_cacheline_aligned(size_t, gfp_t, void **);
+extern struct xpc_rsvd_page *xpc_rsvd_page_init(void);
+extern void xpc_allow_IPI_ops(void);
+extern void xpc_restrict_IPI_ops(void);
+extern int xpc_identify_act_IRQ_sender(void);
+extern int xpc_partition_disengaged(struct xpc_partition *);
+extern enum xpc_retval xpc_mark_partition_active(struct xpc_partition *);
+extern void xpc_mark_partition_inactive(struct xpc_partition *);
+extern void xpc_discovery(void);
+extern void xpc_check_remote_hb(void);
+extern void xpc_deactivate_partition(const int, struct xpc_partition *,
+				     enum xpc_retval);
+extern enum xpc_retval xpc_initiate_partid_to_nasids(partid_t, void *);
+
+/* found in xpc_channel.c */
+extern void xpc_initiate_connect(int);
+extern void xpc_initiate_disconnect(int);
+extern enum xpc_retval xpc_initiate_allocate(partid_t, int, u32, void **);
+extern enum xpc_retval xpc_initiate_send(partid_t, int, void *);
+extern enum xpc_retval xpc_initiate_send_notify(partid_t, int, void *,
+						xpc_notify_func, void *);
+extern void xpc_initiate_received(partid_t, int, void *);
+extern enum xpc_retval xpc_setup_infrastructure(struct xpc_partition *);
+extern enum xpc_retval xpc_pull_remote_vars_part(struct xpc_partition *);
+extern void xpc_process_channel_activity(struct xpc_partition *);
+extern void xpc_connected_callout(struct xpc_channel *);
+extern void xpc_deliver_msg(struct xpc_channel *);
+extern void xpc_disconnect_channel(const int, struct xpc_channel *,
+				   enum xpc_retval, unsigned long *);
+extern void xpc_disconnect_callout(struct xpc_channel *, enum xpc_retval);
+extern void xpc_partition_going_down(struct xpc_partition *, enum xpc_retval);
+extern void xpc_teardown_infrastructure(struct xpc_partition *);
+
+static inline void
+xpc_wakeup_channel_mgr(struct xpc_partition *part)
+{
+	if (atomic_inc_return(&part->channel_mgr_requests) == 1)
+		wake_up(&part->channel_mgr_wq);
+}
+
+/*
+ * These next two inlines are used to keep us from tearing down a channel's
+ * msg queues while a thread may be referencing them.
+ */
+static inline void
+xpc_msgqueue_ref(struct xpc_channel *ch)
+{
+	atomic_inc(&ch->references);
+}
+
+static inline void
+xpc_msgqueue_deref(struct xpc_channel *ch)
+{
+	s32 refs = atomic_dec_return(&ch->references);
+
+	DBUG_ON(refs < 0);
+	if (refs == 0)
+		xpc_wakeup_channel_mgr(&xpc_partitions[ch->partid]);
+}
+
+#define XPC_DISCONNECT_CHANNEL(_ch, _reason, _irqflgs) \
+		xpc_disconnect_channel(__LINE__, _ch, _reason, _irqflgs)
+
+/*
+ * These two inlines are used to keep us from tearing down a partition's
+ * setup infrastructure while a thread may be referencing it.
+ */
+static inline void
+xpc_part_deref(struct xpc_partition *part)
+{
+	s32 refs = atomic_dec_return(&part->references);
+
+	DBUG_ON(refs < 0);
+	if (refs == 0 && part->setup_state == XPC_P_WTEARDOWN)
+		wake_up(&part->teardown_wq);
+}
+
+static inline int
+xpc_part_ref(struct xpc_partition *part)
+{
+	int setup;
+
+	atomic_inc(&part->references);
+	setup = (part->setup_state == XPC_P_SETUP);
+	if (!setup)
+		xpc_part_deref(part);
+
+	return setup;
+}
+
+/*
+ * The following macro is to be used for the setting of the reason and
+ * reason_line fields in both the struct xpc_channel and struct xpc_partition
+ * structures.
+ */
+#define XPC_SET_REASON(_p, _reason, _line) \
+	{ \
+		(_p)->reason = _reason; \
+		(_p)->reason_line = _line; \
+	}
+
+/*
+ * This next set of inlines are used to keep track of when a partition is
+ * potentially engaged in accessing memory belonging to another partition.
+ */
+
+static inline void
+xpc_mark_partition_engaged(struct xpc_partition *part)
+{
+	unsigned long irq_flags;
+	AMO_t *amo = (AMO_t *)__va(part->remote_amos_page_pa +
+				   (XPC_ENGAGED_PARTITIONS_AMO *
+				    sizeof(AMO_t)));
+
+	local_irq_save(irq_flags);
+
+	/* set bit corresponding to our partid in remote partition's AMO */
+	FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_OR,
+			 (1UL << sn_partition_id));
+	/*
+	 * We must always use the nofault function regardless of whether we
+	 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
+	 * didn't, we'd never know that the other partition is down and would
+	 * keep sending IPIs and AMOs to it until the heartbeat times out.
+	 */
+	(void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
+							       variable),
+						     xp_nofault_PIOR_target));
+
+	local_irq_restore(irq_flags);
+}
+
+static inline void
+xpc_mark_partition_disengaged(struct xpc_partition *part)
+{
+	unsigned long irq_flags;
+	AMO_t *amo = (AMO_t *)__va(part->remote_amos_page_pa +
+				   (XPC_ENGAGED_PARTITIONS_AMO *
+				    sizeof(AMO_t)));
+
+	local_irq_save(irq_flags);
+
+	/* clear bit corresponding to our partid in remote partition's AMO */
+	FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
+			 ~(1UL << sn_partition_id));
+	/*
+	 * We must always use the nofault function regardless of whether we
+	 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
+	 * didn't, we'd never know that the other partition is down and would
+	 * keep sending IPIs and AMOs to it until the heartbeat times out.
+	 */
+	(void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
+							       variable),
+						     xp_nofault_PIOR_target));
+
+	local_irq_restore(irq_flags);
+}
+
+static inline void
+xpc_request_partition_disengage(struct xpc_partition *part)
+{
+	unsigned long irq_flags;
+	AMO_t *amo = (AMO_t *)__va(part->remote_amos_page_pa +
+				   (XPC_DISENGAGE_REQUEST_AMO * sizeof(AMO_t)));
+
+	local_irq_save(irq_flags);
+
+	/* set bit corresponding to our partid in remote partition's AMO */
+	FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_OR,
+			 (1UL << sn_partition_id));
+	/*
+	 * We must always use the nofault function regardless of whether we
+	 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
+	 * didn't, we'd never know that the other partition is down and would
+	 * keep sending IPIs and AMOs to it until the heartbeat times out.
+	 */
+	(void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
+							       variable),
+						     xp_nofault_PIOR_target));
+
+	local_irq_restore(irq_flags);
+}
+
+static inline void
+xpc_cancel_partition_disengage_request(struct xpc_partition *part)
+{
+	unsigned long irq_flags;
+	AMO_t *amo = (AMO_t *)__va(part->remote_amos_page_pa +
+				   (XPC_DISENGAGE_REQUEST_AMO * sizeof(AMO_t)));
+
+	local_irq_save(irq_flags);
+
+	/* clear bit corresponding to our partid in remote partition's AMO */
+	FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
+			 ~(1UL << sn_partition_id));
+	/*
+	 * We must always use the nofault function regardless of whether we
+	 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
+	 * didn't, we'd never know that the other partition is down and would
+	 * keep sending IPIs and AMOs to it until the heartbeat times out.
+	 */
+	(void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
+							       variable),
+						     xp_nofault_PIOR_target));
+
+	local_irq_restore(irq_flags);
+}
+
+static inline u64
+xpc_partition_engaged(u64 partid_mask)
+{
+	AMO_t *amo = xpc_vars->amos_page + XPC_ENGAGED_PARTITIONS_AMO;
+
+	/* return our partition's AMO variable ANDed with partid_mask */
+	return (FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_LOAD) &
+		partid_mask);
+}
+
+static inline u64
+xpc_partition_disengage_requested(u64 partid_mask)
+{
+	AMO_t *amo = xpc_vars->amos_page + XPC_DISENGAGE_REQUEST_AMO;
+
+	/* return our partition's AMO variable ANDed with partid_mask */
+	return (FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_LOAD) &
+		partid_mask);
+}
+
+static inline void
+xpc_clear_partition_engaged(u64 partid_mask)
+{
+	AMO_t *amo = xpc_vars->amos_page + XPC_ENGAGED_PARTITIONS_AMO;
+
+	/* clear bit(s) based on partid_mask in our partition's AMO */
+	FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
+			 ~partid_mask);
+}
+
+static inline void
+xpc_clear_partition_disengage_request(u64 partid_mask)
+{
+	AMO_t *amo = xpc_vars->amos_page + XPC_DISENGAGE_REQUEST_AMO;
+
+	/* clear bit(s) based on partid_mask in our partition's AMO */
+	FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
+			 ~partid_mask);
+}
+
+/*
+ * The following set of macros and inlines are used for the sending and
+ * receiving of IPIs (also known as IRQs). There are two flavors of IPIs,
+ * one that is associated with partition activity (SGI_XPC_ACTIVATE) and
+ * the other that is associated with channel activity (SGI_XPC_NOTIFY).
+ */
+
+static inline u64
+xpc_IPI_receive(AMO_t *amo)
+{
+	return FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_CLEAR);
+}
+
+static inline enum xpc_retval
+xpc_IPI_send(AMO_t *amo, u64 flag, int nasid, int phys_cpuid, int vector)
+{
+	int ret = 0;
+	unsigned long irq_flags;
+
+	local_irq_save(irq_flags);
+
+	FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_OR, flag);
+	sn_send_IPI_phys(nasid, phys_cpuid, vector, 0);
+
+	/*
+	 * We must always use the nofault function regardless of whether we
+	 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
+	 * didn't, we'd never know that the other partition is down and would
+	 * keep sending IPIs and AMOs to it until the heartbeat times out.
+	 */
+	ret = xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->variable),
+						     xp_nofault_PIOR_target));
+
+	local_irq_restore(irq_flags);
+
+	return ((ret == 0) ? xpcSuccess : xpcPioReadError);
+}
+
+/*
+ * IPIs associated with SGI_XPC_ACTIVATE IRQ.
+ */
+
+/*
+ * Flag the appropriate AMO variable and send an IPI to the specified node.
+ */
+static inline void
+xpc_activate_IRQ_send(u64 amos_page_pa, int from_nasid, int to_nasid,
+		      int to_phys_cpuid)
+{
+	int w_index = XPC_NASID_W_INDEX(from_nasid);
+	int b_index = XPC_NASID_B_INDEX(from_nasid);
+	AMO_t *amos = (AMO_t *)__va(amos_page_pa +
+				    (XPC_ACTIVATE_IRQ_AMOS * sizeof(AMO_t)));
+
+	(void)xpc_IPI_send(&amos[w_index], (1UL << b_index), to_nasid,
+			   to_phys_cpuid, SGI_XPC_ACTIVATE);
+}
+
+static inline void
+xpc_IPI_send_activate(struct xpc_vars *vars)
+{
+	xpc_activate_IRQ_send(vars->amos_page_pa, cnodeid_to_nasid(0),
+			      vars->act_nasid, vars->act_phys_cpuid);
+}
+
+static inline void
+xpc_IPI_send_activated(struct xpc_partition *part)
+{
+	xpc_activate_IRQ_send(part->remote_amos_page_pa, cnodeid_to_nasid(0),
+			      part->remote_act_nasid,
+			      part->remote_act_phys_cpuid);
+}
+
+static inline void
+xpc_IPI_send_reactivate(struct xpc_partition *part)
+{
+	xpc_activate_IRQ_send(xpc_vars->amos_page_pa, part->reactivate_nasid,
+			      xpc_vars->act_nasid, xpc_vars->act_phys_cpuid);
+}
+
+static inline void
+xpc_IPI_send_disengage(struct xpc_partition *part)
+{
+	xpc_activate_IRQ_send(part->remote_amos_page_pa, cnodeid_to_nasid(0),
+			      part->remote_act_nasid,
+			      part->remote_act_phys_cpuid);
+}
+
+/*
+ * IPIs associated with SGI_XPC_NOTIFY IRQ.
+ */
+
+/*
+ * Send an IPI to the remote partition that is associated with the
+ * specified channel.
+ */
+#define XPC_NOTIFY_IRQ_SEND(_ch, _ipi_f, _irq_f) \
+		xpc_notify_IRQ_send(_ch, _ipi_f, #_ipi_f, _irq_f)
+
+static inline void
+xpc_notify_IRQ_send(struct xpc_channel *ch, u8 ipi_flag, char *ipi_flag_string,
+		    unsigned long *irq_flags)
+{
+	struct xpc_partition *part = &xpc_partitions[ch->partid];
+	enum xpc_retval ret;
+
+	if (likely(part->act_state != XPC_P_DEACTIVATING)) {
+		ret = xpc_IPI_send(part->remote_IPI_amo_va,
+				   (u64)ipi_flag << (ch->number * 8),
+				   part->remote_IPI_nasid,
+				   part->remote_IPI_phys_cpuid, SGI_XPC_NOTIFY);
+		dev_dbg(xpc_chan, "%s sent to partid=%d, channel=%d, ret=%d\n",
+			ipi_flag_string, ch->partid, ch->number, ret);
+		if (unlikely(ret != xpcSuccess)) {
+			if (irq_flags != NULL)
+				spin_unlock_irqrestore(&ch->lock, *irq_flags);
+			XPC_DEACTIVATE_PARTITION(part, ret);
+			if (irq_flags != NULL)
+				spin_lock_irqsave(&ch->lock, *irq_flags);
+		}
+	}
+}
+
+/*
+ * Make it look like the remote partition, which is associated with the
+ * specified channel, sent us an IPI. This faked IPI will be handled
+ * by xpc_dropped_IPI_check().
+ */
+#define XPC_NOTIFY_IRQ_SEND_LOCAL(_ch, _ipi_f) \
+		xpc_notify_IRQ_send_local(_ch, _ipi_f, #_ipi_f)
+
+static inline void
+xpc_notify_IRQ_send_local(struct xpc_channel *ch, u8 ipi_flag,
+			  char *ipi_flag_string)
+{
+	struct xpc_partition *part = &xpc_partitions[ch->partid];
+
+	FETCHOP_STORE_OP(TO_AMO((u64)&part->local_IPI_amo_va->variable),
+			 FETCHOP_OR, ((u64)ipi_flag << (ch->number * 8)));
+	dev_dbg(xpc_chan, "%s sent local from partid=%d, channel=%d\n",
+		ipi_flag_string, ch->partid, ch->number);
+}
+
+/*
+ * The sending and receiving of IPIs includes the setting of an AMO variable
+ * to indicate the reason the IPI was sent. The 64-bit variable is divided
+ * up into eight bytes, ordered from right to left. Byte zero pertains to
+ * channel 0, byte one to channel 1, and so on. Each byte is described by
+ * the following IPI flags.
+ */
+
+#define	XPC_IPI_CLOSEREQUEST	0x01
+#define	XPC_IPI_CLOSEREPLY	0x02
+#define	XPC_IPI_OPENREQUEST	0x04
+#define	XPC_IPI_OPENREPLY	0x08
+#define	XPC_IPI_MSGREQUEST	0x10
+
+/* given an AMO variable and a channel#, get its associated IPI flags */
+#define XPC_GET_IPI_FLAGS(_amo, _c)	((u8) (((_amo) >> ((_c) * 8)) & 0xff))
+#define XPC_SET_IPI_FLAGS(_amo, _c, _f)	(_amo) |= ((u64) (_f) << ((_c) * 8))
+
+#define	XPC_ANY_OPENCLOSE_IPI_FLAGS_SET(_amo) ((_amo) & 0x0f0f0f0f0f0f0f0fUL)
+#define XPC_ANY_MSG_IPI_FLAGS_SET(_amo)       ((_amo) & 0x1010101010101010UL)
+
+static inline void
+xpc_IPI_send_closerequest(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+	struct xpc_openclose_args *args = ch->local_openclose_args;
+
+	args->reason = ch->reason;
+
+	XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_CLOSEREQUEST, irq_flags);
+}
+
+static inline void
+xpc_IPI_send_closereply(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+	XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_CLOSEREPLY, irq_flags);
+}
+
+static inline void
+xpc_IPI_send_openrequest(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+	struct xpc_openclose_args *args = ch->local_openclose_args;
+
+	args->msg_size = ch->msg_size;
+	args->local_nentries = ch->local_nentries;
+
+	XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_OPENREQUEST, irq_flags);
+}
+
+static inline void
+xpc_IPI_send_openreply(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+	struct xpc_openclose_args *args = ch->local_openclose_args;
+
+	args->remote_nentries = ch->remote_nentries;
+	args->local_nentries = ch->local_nentries;
+	args->local_msgqueue_pa = __pa(ch->local_msgqueue);
+
+	XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_OPENREPLY, irq_flags);
+}
+
+static inline void
+xpc_IPI_send_msgrequest(struct xpc_channel *ch)
+{
+	XPC_NOTIFY_IRQ_SEND(ch, XPC_IPI_MSGREQUEST, NULL);
+}
+
+static inline void
+xpc_IPI_send_local_msgrequest(struct xpc_channel *ch)
+{
+	XPC_NOTIFY_IRQ_SEND_LOCAL(ch, XPC_IPI_MSGREQUEST);
+}
+
+/*
+ * Memory for XPC's AMO variables is allocated by the MSPEC driver. These
+ * pages are located in the lowest granule. The lowest granule uses 4k pages
+ * for cached references and an alternate TLB handler to never provide a
+ * cacheable mapping for the entire region. This will prevent speculative
+ * reading of cached copies of our lines from being issued which will cause
+ * a PI FSB Protocol error to be generated by the SHUB. For XPC, we need 64
+ * AMO variables (based on XP_MAX_PARTITIONS) for message notification and an
+ * additional 128 AMO variables (based on XP_NASID_MASK_WORDS) for partition
+ * activation and 2 AMO variables for partition deactivation.
+ */
+static inline AMO_t *
+xpc_IPI_init(int index)
+{
+	AMO_t *amo = xpc_vars->amos_page + index;
+
+	(void)xpc_IPI_receive(amo);	/* clear AMO variable */
+	return amo;
+}
+
+static inline enum xpc_retval
+xpc_map_bte_errors(bte_result_t error)
+{
+	if (error == BTE_SUCCESS)
+		return xpcSuccess;
+
+	if (is_shub2()) {
+		if (BTE_VALID_SH2_ERROR(error))
+			return xpcBteSh2Start + error;
+		return xpcBteUnmappedError;
+	}
+	switch (error) {
+	case BTE_SUCCESS:
+		return xpcSuccess;
+	case BTEFAIL_DIR:
+		return xpcBteDirectoryError;
+	case BTEFAIL_POISON:
+		return xpcBtePoisonError;
+	case BTEFAIL_WERR:
+		return xpcBteWriteError;
+	case BTEFAIL_ACCESS:
+		return xpcBteAccessError;
+	case BTEFAIL_PWERR:
+		return xpcBtePWriteError;
+	case BTEFAIL_PRERR:
+		return xpcBtePReadError;
+	case BTEFAIL_TOUT:
+		return xpcBteTimeOutError;
+	case BTEFAIL_XTERR:
+		return xpcBteXtalkError;
+	case BTEFAIL_NOTAVAIL:
+		return xpcBteNotAvailable;
+	default:
+		return xpcBteUnmappedError;
+	}
+}
+
+/*
+ * Check to see if there is any channel activity to/from the specified
+ * partition.
+ */
+static inline void
+xpc_check_for_channel_activity(struct xpc_partition *part)
+{
+	u64 IPI_amo;
+	unsigned long irq_flags;
+
+	IPI_amo = xpc_IPI_receive(part->local_IPI_amo_va);
+	if (IPI_amo == 0)
+		return;
+
+	spin_lock_irqsave(&part->IPI_lock, irq_flags);
+	part->local_IPI_amo |= IPI_amo;
+	spin_unlock_irqrestore(&part->IPI_lock, irq_flags);
+
+	dev_dbg(xpc_chan, "received IPI from partid=%d, IPI_amo=0x%lx\n",
+		XPC_PARTID(part), IPI_amo);
+
+	xpc_wakeup_channel_mgr(part);
+}
+
+#endif /* _DRIVERS_MISC_SGIXP_XPC_H */
diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c
new file mode 100644
index 00000000000..bfcb9ea968e
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_channel.c
@@ -0,0 +1,2243 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2004-2008 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+/*
+ * Cross Partition Communication (XPC) channel support.
+ *
+ *	This is the part of XPC that manages the channels and
+ *	sends/receives messages across them to/from other partitions.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+#include <linux/mutex.h>
+#include <linux/completion.h>
+#include <asm/sn/bte.h>
+#include <asm/sn/sn_sal.h>
+#include "xpc.h"
+
+/*
+ * Guarantee that the kzalloc'd memory is cacheline aligned.
+ */
+static void *
+xpc_kzalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
+{
+	/* see if kzalloc will give us cachline aligned memory by default */
+	*base = kzalloc(size, flags);
+	if (*base == NULL)
+		return NULL;
+
+	if ((u64)*base == L1_CACHE_ALIGN((u64)*base))
+		return *base;
+
+	kfree(*base);
+
+	/* nope, we'll have to do it ourselves */
+	*base = kzalloc(size + L1_CACHE_BYTES, flags);
+	if (*base == NULL)
+		return NULL;
+
+	return (void *)L1_CACHE_ALIGN((u64)*base);
+}
+
+/*
+ * Set up the initial values for the XPartition Communication channels.
+ */
+static void
+xpc_initialize_channels(struct xpc_partition *part, partid_t partid)
+{
+	int ch_number;
+	struct xpc_channel *ch;
+
+	for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
+		ch = &part->channels[ch_number];
+
+		ch->partid = partid;
+		ch->number = ch_number;
+		ch->flags = XPC_C_DISCONNECTED;
+
+		ch->local_GP = &part->local_GPs[ch_number];
+		ch->local_openclose_args =
+		    &part->local_openclose_args[ch_number];
+
+		atomic_set(&ch->kthreads_assigned, 0);
+		atomic_set(&ch->kthreads_idle, 0);
+		atomic_set(&ch->kthreads_active, 0);
+
+		atomic_set(&ch->references, 0);
+		atomic_set(&ch->n_to_notify, 0);
+
+		spin_lock_init(&ch->lock);
+		mutex_init(&ch->msg_to_pull_mutex);
+		init_completion(&ch->wdisconnect_wait);
+
+		atomic_set(&ch->n_on_msg_allocate_wq, 0);
+		init_waitqueue_head(&ch->msg_allocate_wq);
+		init_waitqueue_head(&ch->idle_wq);
+	}
+}
+
+/*
+ * Setup the infrastructure necessary to support XPartition Communication
+ * between the specified remote partition and the local one.
+ */
+enum xpc_retval
+xpc_setup_infrastructure(struct xpc_partition *part)
+{
+	int ret, cpuid;
+	struct timer_list *timer;
+	partid_t partid = XPC_PARTID(part);
+
+	/*
+	 * Zero out MOST of the entry for this partition. Only the fields
+	 * starting with `nchannels' will be zeroed. The preceding fields must
+	 * remain `viable' across partition ups and downs, since they may be
+	 * referenced during this memset() operation.
+	 */
+	memset(&part->nchannels, 0, sizeof(struct xpc_partition) -
+	       offsetof(struct xpc_partition, nchannels));
+
+	/*
+	 * Allocate all of the channel structures as a contiguous chunk of
+	 * memory.
+	 */
+	part->channels = kzalloc(sizeof(struct xpc_channel) * XPC_NCHANNELS,
+				 GFP_KERNEL);
+	if (part->channels == NULL) {
+		dev_err(xpc_chan, "can't get memory for channels\n");
+		return xpcNoMemory;
+	}
+
+	part->nchannels = XPC_NCHANNELS;
+
+	/* allocate all the required GET/PUT values */
+
+	part->local_GPs = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE,
+							GFP_KERNEL,
+							&part->local_GPs_base);
+	if (part->local_GPs == NULL) {
+		kfree(part->channels);
+		part->channels = NULL;
+		dev_err(xpc_chan, "can't get memory for local get/put "
+			"values\n");
+		return xpcNoMemory;
+	}
+
+	part->remote_GPs = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE,
+							 GFP_KERNEL,
+							 &part->
+							 remote_GPs_base);
+	if (part->remote_GPs == NULL) {
+		dev_err(xpc_chan, "can't get memory for remote get/put "
+			"values\n");
+		kfree(part->local_GPs_base);
+		part->local_GPs = NULL;
+		kfree(part->channels);
+		part->channels = NULL;
+		return xpcNoMemory;
+	}
+
+	/* allocate all the required open and close args */
+
+	part->local_openclose_args =
+	    xpc_kzalloc_cacheline_aligned(XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL,
+					  &part->local_openclose_args_base);
+	if (part->local_openclose_args == NULL) {
+		dev_err(xpc_chan, "can't get memory for local connect args\n");
+		kfree(part->remote_GPs_base);
+		part->remote_GPs = NULL;
+		kfree(part->local_GPs_base);
+		part->local_GPs = NULL;
+		kfree(part->channels);
+		part->channels = NULL;
+		return xpcNoMemory;
+	}
+
+	part->remote_openclose_args =
+	    xpc_kzalloc_cacheline_aligned(XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL,
+					  &part->remote_openclose_args_base);
+	if (part->remote_openclose_args == NULL) {
+		dev_err(xpc_chan, "can't get memory for remote connect args\n");
+		kfree(part->local_openclose_args_base);
+		part->local_openclose_args = NULL;
+		kfree(part->remote_GPs_base);
+		part->remote_GPs = NULL;
+		kfree(part->local_GPs_base);
+		part->local_GPs = NULL;
+		kfree(part->channels);
+		part->channels = NULL;
+		return xpcNoMemory;
+	}
+
+	xpc_initialize_channels(part, partid);
+
+	atomic_set(&part->nchannels_active, 0);
+	atomic_set(&part->nchannels_engaged, 0);
+
+	/* local_IPI_amo were set to 0 by an earlier memset() */
+
+	/* Initialize this partitions AMO_t structure */
+	part->local_IPI_amo_va = xpc_IPI_init(partid);
+
+	spin_lock_init(&part->IPI_lock);
+
+	atomic_set(&part->channel_mgr_requests, 1);
+	init_waitqueue_head(&part->channel_mgr_wq);
+
+	sprintf(part->IPI_owner, "xpc%02d", partid);
+	ret = request_irq(SGI_XPC_NOTIFY, xpc_notify_IRQ_handler, IRQF_SHARED,
+			  part->IPI_owner, (void *)(u64)partid);
+	if (ret != 0) {
+		dev_err(xpc_chan, "can't register NOTIFY IRQ handler, "
+			"errno=%d\n", -ret);
+		kfree(part->remote_openclose_args_base);
+		part->remote_openclose_args = NULL;
+		kfree(part->local_openclose_args_base);
+		part->local_openclose_args = NULL;
+		kfree(part->remote_GPs_base);
+		part->remote_GPs = NULL;
+		kfree(part->local_GPs_base);
+		part->local_GPs = NULL;
+		kfree(part->channels);
+		part->channels = NULL;
+		return xpcLackOfResources;
+	}
+
+	/* Setup a timer to check for dropped IPIs */
+	timer = &part->dropped_IPI_timer;
+	init_timer(timer);
+	timer->function = (void (*)(unsigned long))xpc_dropped_IPI_check;
+	timer->data = (unsigned long)part;
+	timer->expires = jiffies + XPC_P_DROPPED_IPI_WAIT;
+	add_timer(timer);
+
+	/*
+	 * With the setting of the partition setup_state to XPC_P_SETUP, we're
+	 * declaring that this partition is ready to go.
+	 */
+	part->setup_state = XPC_P_SETUP;
+
+	/*
+	 * Setup the per partition specific variables required by the
+	 * remote partition to establish channel connections with us.
+	 *
+	 * The setting of the magic # indicates that these per partition
+	 * specific variables are ready to be used.
+	 */
+	xpc_vars_part[partid].GPs_pa = __pa(part->local_GPs);
+	xpc_vars_part[partid].openclose_args_pa =
+	    __pa(part->local_openclose_args);
+	xpc_vars_part[partid].IPI_amo_pa = __pa(part->local_IPI_amo_va);
+	cpuid = raw_smp_processor_id();	/* any CPU in this partition will do */
+	xpc_vars_part[partid].IPI_nasid = cpuid_to_nasid(cpuid);
+	xpc_vars_part[partid].IPI_phys_cpuid = cpu_physical_id(cpuid);
+	xpc_vars_part[partid].nchannels = part->nchannels;
+	xpc_vars_part[partid].magic = XPC_VP_MAGIC1;
+
+	return xpcSuccess;
+}
+
+/*
+ * Create a wrapper that hides the underlying mechanism for pulling a cacheline
+ * (or multiple cachelines) from a remote partition.
+ *
+ * src must be a cacheline aligned physical address on the remote partition.
+ * dst must be a cacheline aligned virtual address on this partition.
+ * cnt must be an cacheline sized
+ */
+static enum xpc_retval
+xpc_pull_remote_cachelines(struct xpc_partition *part, void *dst,
+			   const void *src, size_t cnt)
+{
+	bte_result_t bte_ret;
+
+	DBUG_ON((u64)src != L1_CACHE_ALIGN((u64)src));
+	DBUG_ON((u64)dst != L1_CACHE_ALIGN((u64)dst));
+	DBUG_ON(cnt != L1_CACHE_ALIGN(cnt));
+
+	if (part->act_state == XPC_P_DEACTIVATING)
+		return part->reason;
+
+	bte_ret = xp_bte_copy((u64)src, (u64)dst, (u64)cnt,
+			      (BTE_NORMAL | BTE_WACQUIRE), NULL);
+	if (bte_ret == BTE_SUCCESS)
+		return xpcSuccess;
+
+	dev_dbg(xpc_chan, "xp_bte_copy() from partition %d failed, ret=%d\n",
+		XPC_PARTID(part), bte_ret);
+
+	return xpc_map_bte_errors(bte_ret);
+}
+
+/*
+ * Pull the remote per partition specific variables from the specified
+ * partition.
+ */
+enum xpc_retval
+xpc_pull_remote_vars_part(struct xpc_partition *part)
+{
+	u8 buffer[L1_CACHE_BYTES * 2];
+	struct xpc_vars_part *pulled_entry_cacheline =
+	    (struct xpc_vars_part *)L1_CACHE_ALIGN((u64)buffer);
+	struct xpc_vars_part *pulled_entry;
+	u64 remote_entry_cacheline_pa, remote_entry_pa;
+	partid_t partid = XPC_PARTID(part);
+	enum xpc_retval ret;
+
+	/* pull the cacheline that contains the variables we're interested in */
+
+	DBUG_ON(part->remote_vars_part_pa !=
+		L1_CACHE_ALIGN(part->remote_vars_part_pa));
+	DBUG_ON(sizeof(struct xpc_vars_part) != L1_CACHE_BYTES / 2);
+
+	remote_entry_pa = part->remote_vars_part_pa +
+	    sn_partition_id * sizeof(struct xpc_vars_part);
+
+	remote_entry_cacheline_pa = (remote_entry_pa & ~(L1_CACHE_BYTES - 1));
+
+	pulled_entry = (struct xpc_vars_part *)((u64)pulled_entry_cacheline +
+						(remote_entry_pa &
+						 (L1_CACHE_BYTES - 1)));
+
+	ret = xpc_pull_remote_cachelines(part, pulled_entry_cacheline,
+					 (void *)remote_entry_cacheline_pa,
+					 L1_CACHE_BYTES);
+	if (ret != xpcSuccess) {
+		dev_dbg(xpc_chan, "failed to pull XPC vars_part from "
+			"partition %d, ret=%d\n", partid, ret);
+		return ret;
+	}
+
+	/* see if they've been set up yet */
+
+	if (pulled_entry->magic != XPC_VP_MAGIC1 &&
+	    pulled_entry->magic != XPC_VP_MAGIC2) {
+
+		if (pulled_entry->magic != 0) {
+			dev_dbg(xpc_chan, "partition %d's XPC vars_part for "
+				"partition %d has bad magic value (=0x%lx)\n",
+				partid, sn_partition_id, pulled_entry->magic);
+			return xpcBadMagic;
+		}
+
+		/* they've not been initialized yet */
+		return xpcRetry;
+	}
+
+	if (xpc_vars_part[partid].magic == XPC_VP_MAGIC1) {
+
+		/* validate the variables */
+
+		if (pulled_entry->GPs_pa == 0 ||
+		    pulled_entry->openclose_args_pa == 0 ||
+		    pulled_entry->IPI_amo_pa == 0) {
+
+			dev_err(xpc_chan, "partition %d's XPC vars_part for "
+				"partition %d are not valid\n", partid,
+				sn_partition_id);
+			return xpcInvalidAddress;
+		}
+
+		/* the variables we imported look to be valid */
+
+		part->remote_GPs_pa = pulled_entry->GPs_pa;
+		part->remote_openclose_args_pa =
+		    pulled_entry->openclose_args_pa;
+		part->remote_IPI_amo_va =
+		    (AMO_t *)__va(pulled_entry->IPI_amo_pa);
+		part->remote_IPI_nasid = pulled_entry->IPI_nasid;
+		part->remote_IPI_phys_cpuid = pulled_entry->IPI_phys_cpuid;
+
+		if (part->nchannels > pulled_entry->nchannels)
+			part->nchannels = pulled_entry->nchannels;
+
+		/* let the other side know that we've pulled their variables */
+
+		xpc_vars_part[partid].magic = XPC_VP_MAGIC2;
+	}
+
+	if (pulled_entry->magic == XPC_VP_MAGIC1)
+		return xpcRetry;
+
+	return xpcSuccess;
+}
+
+/*
+ * Get the IPI flags and pull the openclose args and/or remote GPs as needed.
+ */
+static u64
+xpc_get_IPI_flags(struct xpc_partition *part)
+{
+	unsigned long irq_flags;
+	u64 IPI_amo;
+	enum xpc_retval ret;
+
+	/*
+	 * See if there are any IPI flags to be handled.
+	 */
+
+	spin_lock_irqsave(&part->IPI_lock, irq_flags);
+	IPI_amo = part->local_IPI_amo;
+	if (IPI_amo != 0)
+		part->local_IPI_amo = 0;
+
+	spin_unlock_irqrestore(&part->IPI_lock, irq_flags);
+
+	if (XPC_ANY_OPENCLOSE_IPI_FLAGS_SET(IPI_amo)) {
+		ret = xpc_pull_remote_cachelines(part,
+						 part->remote_openclose_args,
+						 (void *)part->
+						 remote_openclose_args_pa,
+						 XPC_OPENCLOSE_ARGS_SIZE);
+		if (ret != xpcSuccess) {
+			XPC_DEACTIVATE_PARTITION(part, ret);
+
+			dev_dbg(xpc_chan, "failed to pull openclose args from "
+				"partition %d, ret=%d\n", XPC_PARTID(part),
+				ret);
+
+			/* don't bother processing IPIs anymore */
+			IPI_amo = 0;
+		}
+	}
+
+	if (XPC_ANY_MSG_IPI_FLAGS_SET(IPI_amo)) {
+		ret = xpc_pull_remote_cachelines(part, part->remote_GPs,
+						 (void *)part->remote_GPs_pa,
+						 XPC_GP_SIZE);
+		if (ret != xpcSuccess) {
+			XPC_DEACTIVATE_PARTITION(part, ret);
+
+			dev_dbg(xpc_chan, "failed to pull GPs from partition "
+				"%d, ret=%d\n", XPC_PARTID(part), ret);
+
+			/* don't bother processing IPIs anymore */
+			IPI_amo = 0;
+		}
+	}
+
+	return IPI_amo;
+}
+
+/*
+ * Allocate the local message queue and the notify queue.
+ */
+static enum xpc_retval
+xpc_allocate_local_msgqueue(struct xpc_channel *ch)
+{
+	unsigned long irq_flags;
+	int nentries;
+	size_t nbytes;
+
+	for (nentries = ch->local_nentries; nentries > 0; nentries--) {
+
+		nbytes = nentries * ch->msg_size;
+		ch->local_msgqueue = xpc_kzalloc_cacheline_aligned(nbytes,
+								   GFP_KERNEL,
+						      &ch->local_msgqueue_base);
+		if (ch->local_msgqueue == NULL)
+			continue;
+
+		nbytes = nentries * sizeof(struct xpc_notify);
+		ch->notify_queue = kzalloc(nbytes, GFP_KERNEL);
+		if (ch->notify_queue == NULL) {
+			kfree(ch->local_msgqueue_base);
+			ch->local_msgqueue = NULL;
+			continue;
+		}
+
+		spin_lock_irqsave(&ch->lock, irq_flags);
+		if (nentries < ch->local_nentries) {
+			dev_dbg(xpc_chan, "nentries=%d local_nentries=%d, "
+				"partid=%d, channel=%d\n", nentries,
+				ch->local_nentries, ch->partid, ch->number);
+
+			ch->local_nentries = nentries;
+		}
+		spin_unlock_irqrestore(&ch->lock, irq_flags);
+		return xpcSuccess;
+	}
+
+	dev_dbg(xpc_chan, "can't get memory for local message queue and notify "
+		"queue, partid=%d, channel=%d\n", ch->partid, ch->number);
+	return xpcNoMemory;
+}
+
+/*
+ * Allocate the cached remote message queue.
+ */
+static enum xpc_retval
+xpc_allocate_remote_msgqueue(struct xpc_channel *ch)
+{
+	unsigned long irq_flags;
+	int nentries;
+	size_t nbytes;
+
+	DBUG_ON(ch->remote_nentries <= 0);
+
+	for (nentries = ch->remote_nentries; nentries > 0; nentries--) {
+
+		nbytes = nentries * ch->msg_size;
+		ch->remote_msgqueue = xpc_kzalloc_cacheline_aligned(nbytes,
+								    GFP_KERNEL,
+						     &ch->remote_msgqueue_base);
+		if (ch->remote_msgqueue == NULL)
+			continue;
+
+		spin_lock_irqsave(&ch->lock, irq_flags);
+		if (nentries < ch->remote_nentries) {
+			dev_dbg(xpc_chan, "nentries=%d remote_nentries=%d, "
+				"partid=%d, channel=%d\n", nentries,
+				ch->remote_nentries, ch->partid, ch->number);
+
+			ch->remote_nentries = nentries;
+		}
+		spin_unlock_irqrestore(&ch->lock, irq_flags);
+		return xpcSuccess;
+	}
+
+	dev_dbg(xpc_chan, "can't get memory for cached remote message queue, "
+		"partid=%d, channel=%d\n", ch->partid, ch->number);
+	return xpcNoMemory;
+}
+
+/*
+ * Allocate message queues and other stuff associated with a channel.
+ *
+ * Note: Assumes all of the channel sizes are filled in.
+ */
+static enum xpc_retval
+xpc_allocate_msgqueues(struct xpc_channel *ch)
+{
+	unsigned long irq_flags;
+	enum xpc_retval ret;
+
+	DBUG_ON(ch->flags & XPC_C_SETUP);
+
+	ret = xpc_allocate_local_msgqueue(ch);
+	if (ret != xpcSuccess)
+		return ret;
+
+	ret = xpc_allocate_remote_msgqueue(ch);
+	if (ret != xpcSuccess) {
+		kfree(ch->local_msgqueue_base);
+		ch->local_msgqueue = NULL;
+		kfree(ch->notify_queue);
+		ch->notify_queue = NULL;
+		return ret;
+	}
+
+	spin_lock_irqsave(&ch->lock, irq_flags);
+	ch->flags |= XPC_C_SETUP;
+	spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+	return xpcSuccess;
+}
+
+/*
+ * Process a connect message from a remote partition.
+ *
+ * Note: xpc_process_connect() is expecting to be called with the
+ * spin_lock_irqsave held and will leave it locked upon return.
+ */
+static void
+xpc_process_connect(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+	enum xpc_retval ret;
+
+	DBUG_ON(!spin_is_locked(&ch->lock));
+
+	if (!(ch->flags & XPC_C_OPENREQUEST) ||
+	    !(ch->flags & XPC_C_ROPENREQUEST)) {
+		/* nothing more to do for now */
+		return;
+	}
+	DBUG_ON(!(ch->flags & XPC_C_CONNECTING));
+
+	if (!(ch->flags & XPC_C_SETUP)) {
+		spin_unlock_irqrestore(&ch->lock, *irq_flags);
+		ret = xpc_allocate_msgqueues(ch);
+		spin_lock_irqsave(&ch->lock, *irq_flags);
+
+		if (ret != xpcSuccess)
+			XPC_DISCONNECT_CHANNEL(ch, ret, irq_flags);
+
+		if (ch->flags & (XPC_C_CONNECTED | XPC_C_DISCONNECTING))
+			return;
+
+		DBUG_ON(!(ch->flags & XPC_C_SETUP));
+		DBUG_ON(ch->local_msgqueue == NULL);
+		DBUG_ON(ch->remote_msgqueue == NULL);
+	}
+
+	if (!(ch->flags & XPC_C_OPENREPLY)) {
+		ch->flags |= XPC_C_OPENREPLY;
+		xpc_IPI_send_openreply(ch, irq_flags);
+	}
+
+	if (!(ch->flags & XPC_C_ROPENREPLY))
+		return;
+
+	DBUG_ON(ch->remote_msgqueue_pa == 0);
+
+	ch->flags = (XPC_C_CONNECTED | XPC_C_SETUP);	/* clear all else */
+
+	dev_info(xpc_chan, "channel %d to partition %d connected\n",
+		 ch->number, ch->partid);
+
+	spin_unlock_irqrestore(&ch->lock, *irq_flags);
+	xpc_create_kthreads(ch, 1, 0);
+	spin_lock_irqsave(&ch->lock, *irq_flags);
+}
+
+/*
+ * Notify those who wanted to be notified upon delivery of their message.
+ */
+static void
+xpc_notify_senders(struct xpc_channel *ch, enum xpc_retval reason, s64 put)
+{
+	struct xpc_notify *notify;
+	u8 notify_type;
+	s64 get = ch->w_remote_GP.get - 1;
+
+	while (++get < put && atomic_read(&ch->n_to_notify) > 0) {
+
+		notify = &ch->notify_queue[get % ch->local_nentries];
+
+		/*
+		 * See if the notify entry indicates it was associated with
+		 * a message who's sender wants to be notified. It is possible
+		 * that it is, but someone else is doing or has done the
+		 * notification.
+		 */
+		notify_type = notify->type;
+		if (notify_type == 0 ||
+		    cmpxchg(&notify->type, notify_type, 0) != notify_type) {
+			continue;
+		}
+
+		DBUG_ON(notify_type != XPC_N_CALL);
+
+		atomic_dec(&ch->n_to_notify);
+
+		if (notify->func != NULL) {
+			dev_dbg(xpc_chan, "notify->func() called, notify=0x%p, "
+				"msg_number=%ld, partid=%d, channel=%d\n",
+				(void *)notify, get, ch->partid, ch->number);
+
+			notify->func(reason, ch->partid, ch->number,
+				     notify->key);
+
+			dev_dbg(xpc_chan, "notify->func() returned, "
+				"notify=0x%p, msg_number=%ld, partid=%d, "
+				"channel=%d\n", (void *)notify, get,
+				ch->partid, ch->number);
+		}
+	}
+}
+
+/*
+ * Free up message queues and other stuff that were allocated for the specified
+ * channel.
+ *
+ * Note: ch->reason and ch->reason_line are left set for debugging purposes,
+ * they're cleared when XPC_C_DISCONNECTED is cleared.
+ */
+static void
+xpc_free_msgqueues(struct xpc_channel *ch)
+{
+	DBUG_ON(!spin_is_locked(&ch->lock));
+	DBUG_ON(atomic_read(&ch->n_to_notify) != 0);
+
+	ch->remote_msgqueue_pa = 0;
+	ch->func = NULL;
+	ch->key = NULL;
+	ch->msg_size = 0;
+	ch->local_nentries = 0;
+	ch->remote_nentries = 0;
+	ch->kthreads_assigned_limit = 0;
+	ch->kthreads_idle_limit = 0;
+
+	ch->local_GP->get = 0;
+	ch->local_GP->put = 0;
+	ch->remote_GP.get = 0;
+	ch->remote_GP.put = 0;
+	ch->w_local_GP.get = 0;
+	ch->w_local_GP.put = 0;
+	ch->w_remote_GP.get = 0;
+	ch->w_remote_GP.put = 0;
+	ch->next_msg_to_pull = 0;
+
+	if (ch->flags & XPC_C_SETUP) {
+		ch->flags &= ~XPC_C_SETUP;
+
+		dev_dbg(xpc_chan, "ch->flags=0x%x, partid=%d, channel=%d\n",
+			ch->flags, ch->partid, ch->number);
+
+		kfree(ch->local_msgqueue_base);
+		ch->local_msgqueue = NULL;
+		kfree(ch->remote_msgqueue_base);
+		ch->remote_msgqueue = NULL;
+		kfree(ch->notify_queue);
+		ch->notify_queue = NULL;
+	}
+}
+
+/*
+ * spin_lock_irqsave() is expected to be held on entry.
+ */
+static void
+xpc_process_disconnect(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+	struct xpc_partition *part = &xpc_partitions[ch->partid];
+	u32 channel_was_connected = (ch->flags & XPC_C_WASCONNECTED);
+
+	DBUG_ON(!spin_is_locked(&ch->lock));
+
+	if (!(ch->flags & XPC_C_DISCONNECTING))
+		return;
+
+	DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST));
+
+	/* make sure all activity has settled down first */
+
+	if (atomic_read(&ch->kthreads_assigned) > 0 ||
+	    atomic_read(&ch->references) > 0) {
+		return;
+	}
+	DBUG_ON((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
+		!(ch->flags & XPC_C_DISCONNECTINGCALLOUT_MADE));
+
+	if (part->act_state == XPC_P_DEACTIVATING) {
+		/* can't proceed until the other side disengages from us */
+		if (xpc_partition_engaged(1UL << ch->partid))
+			return;
+
+	} else {
+
+		/* as long as the other side is up do the full protocol */
+
+		if (!(ch->flags & XPC_C_RCLOSEREQUEST))
+			return;
+
+		if (!(ch->flags & XPC_C_CLOSEREPLY)) {
+			ch->flags |= XPC_C_CLOSEREPLY;
+			xpc_IPI_send_closereply(ch, irq_flags);
+		}
+
+		if (!(ch->flags & XPC_C_RCLOSEREPLY))
+			return;
+	}
+
+	/* wake those waiting for notify completion */
+	if (atomic_read(&ch->n_to_notify) > 0) {
+		/* >>> we do callout while holding ch->lock */
+		xpc_notify_senders(ch, ch->reason, ch->w_local_GP.put);
+	}
+
+	/* both sides are disconnected now */
+
+	if (ch->flags & XPC_C_DISCONNECTINGCALLOUT_MADE) {
+		spin_unlock_irqrestore(&ch->lock, *irq_flags);
+		xpc_disconnect_callout(ch, xpcDisconnected);
+		spin_lock_irqsave(&ch->lock, *irq_flags);
+	}
+
+	/* it's now safe to free the channel's message queues */
+	xpc_free_msgqueues(ch);
+
+	/* mark disconnected, clear all other flags except XPC_C_WDISCONNECT */
+	ch->flags = (XPC_C_DISCONNECTED | (ch->flags & XPC_C_WDISCONNECT));
+
+	atomic_dec(&part->nchannels_active);
+
+	if (channel_was_connected) {
+		dev_info(xpc_chan, "channel %d to partition %d disconnected, "
+			 "reason=%d\n", ch->number, ch->partid, ch->reason);
+	}
+
+	if (ch->flags & XPC_C_WDISCONNECT) {
+		/* we won't lose the CPU since we're holding ch->lock */
+		complete(&ch->wdisconnect_wait);
+	} else if (ch->delayed_IPI_flags) {
+		if (part->act_state != XPC_P_DEACTIVATING) {
+			/* time to take action on any delayed IPI flags */
+			spin_lock(&part->IPI_lock);
+			XPC_SET_IPI_FLAGS(part->local_IPI_amo, ch->number,
+					  ch->delayed_IPI_flags);
+			spin_unlock(&part->IPI_lock);
+		}
+		ch->delayed_IPI_flags = 0;
+	}
+}
+
+/*
+ * Process a change in the channel's remote connection state.
+ */
+static void
+xpc_process_openclose_IPI(struct xpc_partition *part, int ch_number,
+			  u8 IPI_flags)
+{
+	unsigned long irq_flags;
+	struct xpc_openclose_args *args =
+	    &part->remote_openclose_args[ch_number];
+	struct xpc_channel *ch = &part->channels[ch_number];
+	enum xpc_retval reason;
+
+	spin_lock_irqsave(&ch->lock, irq_flags);
+
+again:
+
+	if ((ch->flags & XPC_C_DISCONNECTED) &&
+	    (ch->flags & XPC_C_WDISCONNECT)) {
+		/*
+		 * Delay processing IPI flags until thread waiting disconnect
+		 * has had a chance to see that the channel is disconnected.
+		 */
+		ch->delayed_IPI_flags |= IPI_flags;
+		spin_unlock_irqrestore(&ch->lock, irq_flags);
+		return;
+	}
+
+	if (IPI_flags & XPC_IPI_CLOSEREQUEST) {
+
+		dev_dbg(xpc_chan, "XPC_IPI_CLOSEREQUEST (reason=%d) received "
+			"from partid=%d, channel=%d\n", args->reason,
+			ch->partid, ch->number);
+
+		/*
+		 * If RCLOSEREQUEST is set, we're probably waiting for
+		 * RCLOSEREPLY. We should find it and a ROPENREQUEST packed
+		 * with this RCLOSEREQUEST in the IPI_flags.
+		 */
+
+		if (ch->flags & XPC_C_RCLOSEREQUEST) {
+			DBUG_ON(!(ch->flags & XPC_C_DISCONNECTING));
+			DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST));
+			DBUG_ON(!(ch->flags & XPC_C_CLOSEREPLY));
+			DBUG_ON(ch->flags & XPC_C_RCLOSEREPLY);
+
+			DBUG_ON(!(IPI_flags & XPC_IPI_CLOSEREPLY));
+			IPI_flags &= ~XPC_IPI_CLOSEREPLY;
+			ch->flags |= XPC_C_RCLOSEREPLY;
+
+			/* both sides have finished disconnecting */
+			xpc_process_disconnect(ch, &irq_flags);
+			DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
+			goto again;
+		}
+
+		if (ch->flags & XPC_C_DISCONNECTED) {
+			if (!(IPI_flags & XPC_IPI_OPENREQUEST)) {
+				if ((XPC_GET_IPI_FLAGS(part->local_IPI_amo,
+						       ch_number) &
+				     XPC_IPI_OPENREQUEST)) {
+
+					DBUG_ON(ch->delayed_IPI_flags != 0);
+					spin_lock(&part->IPI_lock);
+					XPC_SET_IPI_FLAGS(part->local_IPI_amo,
+							  ch_number,
+							  XPC_IPI_CLOSEREQUEST);
+					spin_unlock(&part->IPI_lock);
+				}
+				spin_unlock_irqrestore(&ch->lock, irq_flags);
+				return;
+			}
+
+			XPC_SET_REASON(ch, 0, 0);
+			ch->flags &= ~XPC_C_DISCONNECTED;
+
+			atomic_inc(&part->nchannels_active);
+			ch->flags |= (XPC_C_CONNECTING | XPC_C_ROPENREQUEST);
+		}
+
+		IPI_flags &= ~(XPC_IPI_OPENREQUEST | XPC_IPI_OPENREPLY);
+
+		/*
+		 * The meaningful CLOSEREQUEST connection state fields are:
+		 *      reason = reason connection is to be closed
+		 */
+
+		ch->flags |= XPC_C_RCLOSEREQUEST;
+
+		if (!(ch->flags & XPC_C_DISCONNECTING)) {
+			reason = args->reason;
+			if (reason <= xpcSuccess || reason > xpcUnknownReason)
+				reason = xpcUnknownReason;
+			else if (reason == xpcUnregistering)
+				reason = xpcOtherUnregistering;
+
+			XPC_DISCONNECT_CHANNEL(ch, reason, &irq_flags);
+
+			DBUG_ON(IPI_flags & XPC_IPI_CLOSEREPLY);
+			spin_unlock_irqrestore(&ch->lock, irq_flags);
+			return;
+		}
+
+		xpc_process_disconnect(ch, &irq_flags);
+	}
+
+	if (IPI_flags & XPC_IPI_CLOSEREPLY) {
+
+		dev_dbg(xpc_chan, "XPC_IPI_CLOSEREPLY received from partid=%d,"
+			" channel=%d\n", ch->partid, ch->number);
+
+		if (ch->flags & XPC_C_DISCONNECTED) {
+			DBUG_ON(part->act_state != XPC_P_DEACTIVATING);
+			spin_unlock_irqrestore(&ch->lock, irq_flags);
+			return;
+		}
+
+		DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST));
+
+		if (!(ch->flags & XPC_C_RCLOSEREQUEST)) {
+			if ((XPC_GET_IPI_FLAGS(part->local_IPI_amo, ch_number)
+			     & XPC_IPI_CLOSEREQUEST)) {
+
+				DBUG_ON(ch->delayed_IPI_flags != 0);
+				spin_lock(&part->IPI_lock);
+				XPC_SET_IPI_FLAGS(part->local_IPI_amo,
+						  ch_number,
+						  XPC_IPI_CLOSEREPLY);
+				spin_unlock(&part->IPI_lock);
+			}
+			spin_unlock_irqrestore(&ch->lock, irq_flags);
+			return;
+		}
+
+		ch->flags |= XPC_C_RCLOSEREPLY;
+
+		if (ch->flags & XPC_C_CLOSEREPLY) {
+			/* both sides have finished disconnecting */
+			xpc_process_disconnect(ch, &irq_flags);
+		}
+	}
+
+	if (IPI_flags & XPC_IPI_OPENREQUEST) {
+
+		dev_dbg(xpc_chan, "XPC_IPI_OPENREQUEST (msg_size=%d, "
+			"local_nentries=%d) received from partid=%d, "
+			"channel=%d\n", args->msg_size, args->local_nentries,
+			ch->partid, ch->number);
+
+		if (part->act_state == XPC_P_DEACTIVATING ||
+		    (ch->flags & XPC_C_ROPENREQUEST)) {
+			spin_unlock_irqrestore(&ch->lock, irq_flags);
+			return;
+		}
+
+		if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_WDISCONNECT)) {
+			ch->delayed_IPI_flags |= XPC_IPI_OPENREQUEST;
+			spin_unlock_irqrestore(&ch->lock, irq_flags);
+			return;
+		}
+		DBUG_ON(!(ch->flags & (XPC_C_DISCONNECTED |
+				       XPC_C_OPENREQUEST)));
+		DBUG_ON(ch->flags & (XPC_C_ROPENREQUEST | XPC_C_ROPENREPLY |
+				     XPC_C_OPENREPLY | XPC_C_CONNECTED));
+
+		/*
+		 * The meaningful OPENREQUEST connection state fields are:
+		 *      msg_size = size of channel's messages in bytes
+		 *      local_nentries = remote partition's local_nentries
+		 */
+		if (args->msg_size == 0 || args->local_nentries == 0) {
+			/* assume OPENREQUEST was delayed by mistake */
+			spin_unlock_irqrestore(&ch->lock, irq_flags);
+			return;
+		}
+
+		ch->flags |= (XPC_C_ROPENREQUEST | XPC_C_CONNECTING);
+		ch->remote_nentries = args->local_nentries;
+
+		if (ch->flags & XPC_C_OPENREQUEST) {
+			if (args->msg_size != ch->msg_size) {
+				XPC_DISCONNECT_CHANNEL(ch, xpcUnequalMsgSizes,
+						       &irq_flags);
+				spin_unlock_irqrestore(&ch->lock, irq_flags);
+				return;
+			}
+		} else {
+			ch->msg_size = args->msg_size;
+
+			XPC_SET_REASON(ch, 0, 0);
+			ch->flags &= ~XPC_C_DISCONNECTED;
+
+			atomic_inc(&part->nchannels_active);
+		}
+
+		xpc_process_connect(ch, &irq_flags);
+	}
+
+	if (IPI_flags & XPC_IPI_OPENREPLY) {
+
+		dev_dbg(xpc_chan, "XPC_IPI_OPENREPLY (local_msgqueue_pa=0x%lx, "
+			"local_nentries=%d, remote_nentries=%d) received from "
+			"partid=%d, channel=%d\n", args->local_msgqueue_pa,
+			args->local_nentries, args->remote_nentries,
+			ch->partid, ch->number);
+
+		if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_DISCONNECTED)) {
+			spin_unlock_irqrestore(&ch->lock, irq_flags);
+			return;
+		}
+		if (!(ch->flags & XPC_C_OPENREQUEST)) {
+			XPC_DISCONNECT_CHANNEL(ch, xpcOpenCloseError,
+					       &irq_flags);
+			spin_unlock_irqrestore(&ch->lock, irq_flags);
+			return;
+		}
+
+		DBUG_ON(!(ch->flags & XPC_C_ROPENREQUEST));
+		DBUG_ON(ch->flags & XPC_C_CONNECTED);
+
+		/*
+		 * The meaningful OPENREPLY connection state fields are:
+		 *      local_msgqueue_pa = physical address of remote
+		 *                          partition's local_msgqueue
+		 *      local_nentries = remote partition's local_nentries
+		 *      remote_nentries = remote partition's remote_nentries
+		 */
+		DBUG_ON(args->local_msgqueue_pa == 0);
+		DBUG_ON(args->local_nentries == 0);
+		DBUG_ON(args->remote_nentries == 0);
+
+		ch->flags |= XPC_C_ROPENREPLY;
+		ch->remote_msgqueue_pa = args->local_msgqueue_pa;
+
+		if (args->local_nentries < ch->remote_nentries) {
+			dev_dbg(xpc_chan, "XPC_IPI_OPENREPLY: new "
+				"remote_nentries=%d, old remote_nentries=%d, "
+				"partid=%d, channel=%d\n",
+				args->local_nentries, ch->remote_nentries,
+				ch->partid, ch->number);
+
+			ch->remote_nentries = args->local_nentries;
+		}
+		if (args->remote_nentries < ch->local_nentries) {
+			dev_dbg(xpc_chan, "XPC_IPI_OPENREPLY: new "
+				"local_nentries=%d, old local_nentries=%d, "
+				"partid=%d, channel=%d\n",
+				args->remote_nentries, ch->local_nentries,
+				ch->partid, ch->number);
+
+			ch->local_nentries = args->remote_nentries;
+		}
+
+		xpc_process_connect(ch, &irq_flags);
+	}
+
+	spin_unlock_irqrestore(&ch->lock, irq_flags);
+}
+
+/*
+ * Attempt to establish a channel connection to a remote partition.
+ */
+static enum xpc_retval
+xpc_connect_channel(struct xpc_channel *ch)
+{
+	unsigned long irq_flags;
+	struct xpc_registration *registration = &xpc_registrations[ch->number];
+
+	if (mutex_trylock(&registration->mutex) == 0)
+		return xpcRetry;
+
+	if (!XPC_CHANNEL_REGISTERED(ch->number)) {
+		mutex_unlock(&registration->mutex);
+		return xpcUnregistered;
+	}
+
+	spin_lock_irqsave(&ch->lock, irq_flags);
+
+	DBUG_ON(ch->flags & XPC_C_CONNECTED);
+	DBUG_ON(ch->flags & XPC_C_OPENREQUEST);
+
+	if (ch->flags & XPC_C_DISCONNECTING) {
+		spin_unlock_irqrestore(&ch->lock, irq_flags);
+		mutex_unlock(&registration->mutex);
+		return ch->reason;
+	}
+
+	/* add info from the channel connect registration to the channel */
+
+	ch->kthreads_assigned_limit = registration->assigned_limit;
+	ch->kthreads_idle_limit = registration->idle_limit;
+	DBUG_ON(atomic_read(&ch->kthreads_assigned) != 0);
+	DBUG_ON(atomic_read(&ch->kthreads_idle) != 0);
+	DBUG_ON(atomic_read(&ch->kthreads_active) != 0);
+
+	ch->func = registration->func;
+	DBUG_ON(registration->func == NULL);
+	ch->key = registration->key;
+
+	ch->local_nentries = registration->nentries;
+
+	if (ch->flags & XPC_C_ROPENREQUEST) {
+		if (registration->msg_size != ch->msg_size) {
+			/* the local and remote sides aren't the same */
+
+			/*
+			 * Because XPC_DISCONNECT_CHANNEL() can block we're
+			 * forced to up the registration sema before we unlock
+			 * the channel lock. But that's okay here because we're
+			 * done with the part that required the registration
+			 * sema. XPC_DISCONNECT_CHANNEL() requires that the
+			 * channel lock be locked and will unlock and relock
+			 * the channel lock as needed.
+			 */
+			mutex_unlock(&registration->mutex);
+			XPC_DISCONNECT_CHANNEL(ch, xpcUnequalMsgSizes,
+					       &irq_flags);
+			spin_unlock_irqrestore(&ch->lock, irq_flags);
+			return xpcUnequalMsgSizes;
+		}
+	} else {
+		ch->msg_size = registration->msg_size;
+
+		XPC_SET_REASON(ch, 0, 0);
+		ch->flags &= ~XPC_C_DISCONNECTED;
+
+		atomic_inc(&xpc_partitions[ch->partid].nchannels_active);
+	}
+
+	mutex_unlock(&registration->mutex);
+
+	/* initiate the connection */
+
+	ch->flags |= (XPC_C_OPENREQUEST | XPC_C_CONNECTING);
+	xpc_IPI_send_openrequest(ch, &irq_flags);
+
+	xpc_process_connect(ch, &irq_flags);
+
+	spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+	return xpcSuccess;
+}
+
+/*
+ * Clear some of the msg flags in the local message queue.
+ */
+static inline void
+xpc_clear_local_msgqueue_flags(struct xpc_channel *ch)
+{
+	struct xpc_msg *msg;
+	s64 get;
+
+	get = ch->w_remote_GP.get;
+	do {
+		msg = (struct xpc_msg *)((u64)ch->local_msgqueue +
+					 (get % ch->local_nentries) *
+					 ch->msg_size);
+		msg->flags = 0;
+	} while (++get < ch->remote_GP.get);
+}
+
+/*
+ * Clear some of the msg flags in the remote message queue.
+ */
+static inline void
+xpc_clear_remote_msgqueue_flags(struct xpc_channel *ch)
+{
+	struct xpc_msg *msg;
+	s64 put;
+
+	put = ch->w_remote_GP.put;
+	do {
+		msg = (struct xpc_msg *)((u64)ch->remote_msgqueue +
+					 (put % ch->remote_nentries) *
+					 ch->msg_size);
+		msg->flags = 0;
+	} while (++put < ch->remote_GP.put);
+}
+
+static void
+xpc_process_msg_IPI(struct xpc_partition *part, int ch_number)
+{
+	struct xpc_channel *ch = &part->channels[ch_number];
+	int nmsgs_sent;
+
+	ch->remote_GP = part->remote_GPs[ch_number];
+
+	/* See what, if anything, has changed for each connected channel */
+
+	xpc_msgqueue_ref(ch);
+
+	if (ch->w_remote_GP.get == ch->remote_GP.get &&
+	    ch->w_remote_GP.put == ch->remote_GP.put) {
+		/* nothing changed since GPs were last pulled */
+		xpc_msgqueue_deref(ch);
+		return;
+	}
+
+	if (!(ch->flags & XPC_C_CONNECTED)) {
+		xpc_msgqueue_deref(ch);
+		return;
+	}
+
+	/*
+	 * First check to see if messages recently sent by us have been
+	 * received by the other side. (The remote GET value will have
+	 * changed since we last looked at it.)
+	 */
+
+	if (ch->w_remote_GP.get != ch->remote_GP.get) {
+
+		/*
+		 * We need to notify any senders that want to be notified
+		 * that their sent messages have been received by their
+		 * intended recipients. We need to do this before updating
+		 * w_remote_GP.get so that we don't allocate the same message
+		 * queue entries prematurely (see xpc_allocate_msg()).
+		 */
+		if (atomic_read(&ch->n_to_notify) > 0) {
+			/*
+			 * Notify senders that messages sent have been
+			 * received and delivered by the other side.
+			 */
+			xpc_notify_senders(ch, xpcMsgDelivered,
+					   ch->remote_GP.get);
+		}
+
+		/*
+		 * Clear msg->flags in previously sent messages, so that
+		 * they're ready for xpc_allocate_msg().
+		 */
+		xpc_clear_local_msgqueue_flags(ch);
+
+		ch->w_remote_GP.get = ch->remote_GP.get;
+
+		dev_dbg(xpc_chan, "w_remote_GP.get changed to %ld, partid=%d, "
+			"channel=%d\n", ch->w_remote_GP.get, ch->partid,
+			ch->number);
+
+		/*
+		 * If anyone was waiting for message queue entries to become
+		 * available, wake them up.
+		 */
+		if (atomic_read(&ch->n_on_msg_allocate_wq) > 0)
+			wake_up(&ch->msg_allocate_wq);
+	}
+
+	/*
+	 * Now check for newly sent messages by the other side. (The remote
+	 * PUT value will have changed since we last looked at it.)
+	 */
+
+	if (ch->w_remote_GP.put != ch->remote_GP.put) {
+		/*
+		 * Clear msg->flags in previously received messages, so that
+		 * they're ready for xpc_get_deliverable_msg().
+		 */
+		xpc_clear_remote_msgqueue_flags(ch);
+
+		ch->w_remote_GP.put = ch->remote_GP.put;
+
+		dev_dbg(xpc_chan, "w_remote_GP.put changed to %ld, partid=%d, "
+			"channel=%d\n", ch->w_remote_GP.put, ch->partid,
+			ch->number);
+
+		nmsgs_sent = ch->w_remote_GP.put - ch->w_local_GP.get;
+		if (nmsgs_sent > 0) {
+			dev_dbg(xpc_chan, "msgs waiting to be copied and "
+				"delivered=%d, partid=%d, channel=%d\n",
+				nmsgs_sent, ch->partid, ch->number);
+
+			if (ch->flags & XPC_C_CONNECTEDCALLOUT_MADE)
+				xpc_activate_kthreads(ch, nmsgs_sent);
+		}
+	}
+
+	xpc_msgqueue_deref(ch);
+}
+
+void
+xpc_process_channel_activity(struct xpc_partition *part)
+{
+	unsigned long irq_flags;
+	u64 IPI_amo, IPI_flags;
+	struct xpc_channel *ch;
+	int ch_number;
+	u32 ch_flags;
+
+	IPI_amo = xpc_get_IPI_flags(part);
+
+	/*
+	 * Initiate channel connections for registered channels.
+	 *
+	 * For each connected channel that has pending messages activate idle
+	 * kthreads and/or create new kthreads as needed.
+	 */
+
+	for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
+		ch = &part->channels[ch_number];
+
+		/*
+		 * Process any open or close related IPI flags, and then deal
+		 * with connecting or disconnecting the channel as required.
+		 */
+
+		IPI_flags = XPC_GET_IPI_FLAGS(IPI_amo, ch_number);
+
+		if (XPC_ANY_OPENCLOSE_IPI_FLAGS_SET(IPI_flags))
+			xpc_process_openclose_IPI(part, ch_number, IPI_flags);
+
+		ch_flags = ch->flags;	/* need an atomic snapshot of flags */
+
+		if (ch_flags & XPC_C_DISCONNECTING) {
+			spin_lock_irqsave(&ch->lock, irq_flags);
+			xpc_process_disconnect(ch, &irq_flags);
+			spin_unlock_irqrestore(&ch->lock, irq_flags);
+			continue;
+		}
+
+		if (part->act_state == XPC_P_DEACTIVATING)
+			continue;
+
+		if (!(ch_flags & XPC_C_CONNECTED)) {
+			if (!(ch_flags & XPC_C_OPENREQUEST)) {
+				DBUG_ON(ch_flags & XPC_C_SETUP);
+				(void)xpc_connect_channel(ch);
+			} else {
+				spin_lock_irqsave(&ch->lock, irq_flags);
+				xpc_process_connect(ch, &irq_flags);
+				spin_unlock_irqrestore(&ch->lock, irq_flags);
+			}
+			continue;
+		}
+
+		/*
+		 * Process any message related IPI flags, this may involve the
+		 * activation of kthreads to deliver any pending messages sent
+		 * from the other partition.
+		 */
+
+		if (XPC_ANY_MSG_IPI_FLAGS_SET(IPI_flags))
+			xpc_process_msg_IPI(part, ch_number);
+	}
+}
+
+/*
+ * XPC's heartbeat code calls this function to inform XPC that a partition is
+ * going down.  XPC responds by tearing down the XPartition Communication
+ * infrastructure used for the just downed partition.
+ *
+ * XPC's heartbeat code will never call this function and xpc_partition_up()
+ * at the same time. Nor will it ever make multiple calls to either function
+ * at the same time.
+ */
+void
+xpc_partition_going_down(struct xpc_partition *part, enum xpc_retval reason)
+{
+	unsigned long irq_flags;
+	int ch_number;
+	struct xpc_channel *ch;
+
+	dev_dbg(xpc_chan, "deactivating partition %d, reason=%d\n",
+		XPC_PARTID(part), reason);
+
+	if (!xpc_part_ref(part)) {
+		/* infrastructure for this partition isn't currently set up */
+		return;
+	}
+
+	/* disconnect channels associated with the partition going down */
+
+	for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
+		ch = &part->channels[ch_number];
+
+		xpc_msgqueue_ref(ch);
+		spin_lock_irqsave(&ch->lock, irq_flags);
+
+		XPC_DISCONNECT_CHANNEL(ch, reason, &irq_flags);
+
+		spin_unlock_irqrestore(&ch->lock, irq_flags);
+		xpc_msgqueue_deref(ch);
+	}
+
+	xpc_wakeup_channel_mgr(part);
+
+	xpc_part_deref(part);
+}
+
+/*
+ * Teardown the infrastructure necessary to support XPartition Communication
+ * between the specified remote partition and the local one.
+ */
+void
+xpc_teardown_infrastructure(struct xpc_partition *part)
+{
+	partid_t partid = XPC_PARTID(part);
+
+	/*
+	 * We start off by making this partition inaccessible to local
+	 * processes by marking it as no longer setup. Then we make it
+	 * inaccessible to remote processes by clearing the XPC per partition
+	 * specific variable's magic # (which indicates that these variables
+	 * are no longer valid) and by ignoring all XPC notify IPIs sent to
+	 * this partition.
+	 */
+
+	DBUG_ON(atomic_read(&part->nchannels_engaged) != 0);
+	DBUG_ON(atomic_read(&part->nchannels_active) != 0);
+	DBUG_ON(part->setup_state != XPC_P_SETUP);
+	part->setup_state = XPC_P_WTEARDOWN;
+
+	xpc_vars_part[partid].magic = 0;
+
+	free_irq(SGI_XPC_NOTIFY, (void *)(u64)partid);
+
+	/*
+	 * Before proceeding with the teardown we have to wait until all
+	 * existing references cease.
+	 */
+	wait_event(part->teardown_wq, (atomic_read(&part->references) == 0));
+
+	/* now we can begin tearing down the infrastructure */
+
+	part->setup_state = XPC_P_TORNDOWN;
+
+	/* in case we've still got outstanding timers registered... */
+	del_timer_sync(&part->dropped_IPI_timer);
+
+	kfree(part->remote_openclose_args_base);
+	part->remote_openclose_args = NULL;
+	kfree(part->local_openclose_args_base);
+	part->local_openclose_args = NULL;
+	kfree(part->remote_GPs_base);
+	part->remote_GPs = NULL;
+	kfree(part->local_GPs_base);
+	part->local_GPs = NULL;
+	kfree(part->channels);
+	part->channels = NULL;
+	part->local_IPI_amo_va = NULL;
+}
+
+/*
+ * Called by XP at the time of channel connection registration to cause
+ * XPC to establish connections to all currently active partitions.
+ */
+void
+xpc_initiate_connect(int ch_number)
+{
+	partid_t partid;
+	struct xpc_partition *part;
+	struct xpc_channel *ch;
+
+	DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS);
+
+	for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
+		part = &xpc_partitions[partid];
+
+		if (xpc_part_ref(part)) {
+			ch = &part->channels[ch_number];
+
+			/*
+			 * Initiate the establishment of a connection on the
+			 * newly registered channel to the remote partition.
+			 */
+			xpc_wakeup_channel_mgr(part);
+			xpc_part_deref(part);
+		}
+	}
+}
+
+void
+xpc_connected_callout(struct xpc_channel *ch)
+{
+	/* let the registerer know that a connection has been established */
+
+	if (ch->func != NULL) {
+		dev_dbg(xpc_chan, "ch->func() called, reason=xpcConnected, "
+			"partid=%d, channel=%d\n", ch->partid, ch->number);
+
+		ch->func(xpcConnected, ch->partid, ch->number,
+			 (void *)(u64)ch->local_nentries, ch->key);
+
+		dev_dbg(xpc_chan, "ch->func() returned, reason=xpcConnected, "
+			"partid=%d, channel=%d\n", ch->partid, ch->number);
+	}
+}
+
+/*
+ * Called by XP at the time of channel connection unregistration to cause
+ * XPC to teardown all current connections for the specified channel.
+ *
+ * Before returning xpc_initiate_disconnect() will wait until all connections
+ * on the specified channel have been closed/torndown. So the caller can be
+ * assured that they will not be receiving any more callouts from XPC to the
+ * function they registered via xpc_connect().
+ *
+ * Arguments:
+ *
+ *	ch_number - channel # to unregister.
+ */
+void
+xpc_initiate_disconnect(int ch_number)
+{
+	unsigned long irq_flags;
+	partid_t partid;
+	struct xpc_partition *part;
+	struct xpc_channel *ch;
+
+	DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS);
+
+	/* initiate the channel disconnect for every active partition */
+	for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
+		part = &xpc_partitions[partid];
+
+		if (xpc_part_ref(part)) {
+			ch = &part->channels[ch_number];
+			xpc_msgqueue_ref(ch);
+
+			spin_lock_irqsave(&ch->lock, irq_flags);
+
+			if (!(ch->flags & XPC_C_DISCONNECTED)) {
+				ch->flags |= XPC_C_WDISCONNECT;
+
+				XPC_DISCONNECT_CHANNEL(ch, xpcUnregistering,
+						       &irq_flags);
+			}
+
+			spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+			xpc_msgqueue_deref(ch);
+			xpc_part_deref(part);
+		}
+	}
+
+	xpc_disconnect_wait(ch_number);
+}
+
+/*
+ * To disconnect a channel, and reflect it back to all who may be waiting.
+ *
+ * An OPEN is not allowed until XPC_C_DISCONNECTING is cleared by
+ * xpc_process_disconnect(), and if set, XPC_C_WDISCONNECT is cleared by
+ * xpc_disconnect_wait().
+ *
+ * THE CHANNEL IS TO BE LOCKED BY THE CALLER AND WILL REMAIN LOCKED UPON RETURN.
+ */
+void
+xpc_disconnect_channel(const int line, struct xpc_channel *ch,
+		       enum xpc_retval reason, unsigned long *irq_flags)
+{
+	u32 channel_was_connected = (ch->flags & XPC_C_CONNECTED);
+
+	DBUG_ON(!spin_is_locked(&ch->lock));
+
+	if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_DISCONNECTED))
+		return;
+
+	DBUG_ON(!(ch->flags & (XPC_C_CONNECTING | XPC_C_CONNECTED)));
+
+	dev_dbg(xpc_chan, "reason=%d, line=%d, partid=%d, channel=%d\n",
+		reason, line, ch->partid, ch->number);
+
+	XPC_SET_REASON(ch, reason, line);
+
+	ch->flags |= (XPC_C_CLOSEREQUEST | XPC_C_DISCONNECTING);
+	/* some of these may not have been set */
+	ch->flags &= ~(XPC_C_OPENREQUEST | XPC_C_OPENREPLY |
+		       XPC_C_ROPENREQUEST | XPC_C_ROPENREPLY |
+		       XPC_C_CONNECTING | XPC_C_CONNECTED);
+
+	xpc_IPI_send_closerequest(ch, irq_flags);
+
+	if (channel_was_connected)
+		ch->flags |= XPC_C_WASCONNECTED;
+
+	spin_unlock_irqrestore(&ch->lock, *irq_flags);
+
+	/* wake all idle kthreads so they can exit */
+	if (atomic_read(&ch->kthreads_idle) > 0) {
+		wake_up_all(&ch->idle_wq);
+
+	} else if ((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
+		   !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) {
+		/* start a kthread that will do the xpcDisconnecting callout */
+		xpc_create_kthreads(ch, 1, 1);
+	}
+
+	/* wake those waiting to allocate an entry from the local msg queue */
+	if (atomic_read(&ch->n_on_msg_allocate_wq) > 0)
+		wake_up(&ch->msg_allocate_wq);
+
+	spin_lock_irqsave(&ch->lock, *irq_flags);
+}
+
+void
+xpc_disconnect_callout(struct xpc_channel *ch, enum xpc_retval reason)
+{
+	/*
+	 * Let the channel's registerer know that the channel is being
+	 * disconnected. We don't want to do this if the registerer was never
+	 * informed of a connection being made.
+	 */
+
+	if (ch->func != NULL) {
+		dev_dbg(xpc_chan, "ch->func() called, reason=%d, partid=%d, "
+			"channel=%d\n", reason, ch->partid, ch->number);
+
+		ch->func(reason, ch->partid, ch->number, NULL, ch->key);
+
+		dev_dbg(xpc_chan, "ch->func() returned, reason=%d, partid=%d, "
+			"channel=%d\n", reason, ch->partid, ch->number);
+	}
+}
+
+/*
+ * Wait for a message entry to become available for the specified channel,
+ * but don't wait any longer than 1 jiffy.
+ */
+static enum xpc_retval
+xpc_allocate_msg_wait(struct xpc_channel *ch)
+{
+	enum xpc_retval ret;
+
+	if (ch->flags & XPC_C_DISCONNECTING) {
+		DBUG_ON(ch->reason == xpcInterrupted);
+		return ch->reason;
+	}
+
+	atomic_inc(&ch->n_on_msg_allocate_wq);
+	ret = interruptible_sleep_on_timeout(&ch->msg_allocate_wq, 1);
+	atomic_dec(&ch->n_on_msg_allocate_wq);
+
+	if (ch->flags & XPC_C_DISCONNECTING) {
+		ret = ch->reason;
+		DBUG_ON(ch->reason == xpcInterrupted);
+	} else if (ret == 0) {
+		ret = xpcTimeout;
+	} else {
+		ret = xpcInterrupted;
+	}
+
+	return ret;
+}
+
+/*
+ * Allocate an entry for a message from the message queue associated with the
+ * specified channel.
+ */
+static enum xpc_retval
+xpc_allocate_msg(struct xpc_channel *ch, u32 flags,
+		 struct xpc_msg **address_of_msg)
+{
+	struct xpc_msg *msg;
+	enum xpc_retval ret;
+	s64 put;
+
+	/* this reference will be dropped in xpc_send_msg() */
+	xpc_msgqueue_ref(ch);
+
+	if (ch->flags & XPC_C_DISCONNECTING) {
+		xpc_msgqueue_deref(ch);
+		return ch->reason;
+	}
+	if (!(ch->flags & XPC_C_CONNECTED)) {
+		xpc_msgqueue_deref(ch);
+		return xpcNotConnected;
+	}
+
+	/*
+	 * Get the next available message entry from the local message queue.
+	 * If none are available, we'll make sure that we grab the latest
+	 * GP values.
+	 */
+	ret = xpcTimeout;
+
+	while (1) {
+
+		put = ch->w_local_GP.put;
+		rmb();	/* guarantee that .put loads before .get */
+		if (put - ch->w_remote_GP.get < ch->local_nentries) {
+
+			/* There are available message entries. We need to try
+			 * to secure one for ourselves. We'll do this by trying
+			 * to increment w_local_GP.put as long as someone else
+			 * doesn't beat us to it. If they do, we'll have to
+			 * try again.
+			 */
+			if (cmpxchg(&ch->w_local_GP.put, put, put + 1) == put) {
+				/* we got the entry referenced by put */
+				break;
+			}
+			continue;	/* try again */
+		}
+
+		/*
+		 * There aren't any available msg entries at this time.
+		 *
+		 * In waiting for a message entry to become available,
+		 * we set a timeout in case the other side is not
+		 * sending completion IPIs. This lets us fake an IPI
+		 * that will cause the IPI handler to fetch the latest
+		 * GP values as if an IPI was sent by the other side.
+		 */
+		if (ret == xpcTimeout)
+			xpc_IPI_send_local_msgrequest(ch);
+
+		if (flags & XPC_NOWAIT) {
+			xpc_msgqueue_deref(ch);
+			return xpcNoWait;
+		}
+
+		ret = xpc_allocate_msg_wait(ch);
+		if (ret != xpcInterrupted && ret != xpcTimeout) {
+			xpc_msgqueue_deref(ch);
+			return ret;
+		}
+	}
+
+	/* get the message's address and initialize it */
+	msg = (struct xpc_msg *)((u64)ch->local_msgqueue +
+				 (put % ch->local_nentries) * ch->msg_size);
+
+	DBUG_ON(msg->flags != 0);
+	msg->number = put;
+
+	dev_dbg(xpc_chan, "w_local_GP.put changed to %ld; msg=0x%p, "
+		"msg_number=%ld, partid=%d, channel=%d\n", put + 1,
+		(void *)msg, msg->number, ch->partid, ch->number);
+
+	*address_of_msg = msg;
+
+	return xpcSuccess;
+}
+
+/*
+ * Allocate an entry for a message from the message queue associated with the
+ * specified channel. NOTE that this routine can sleep waiting for a message
+ * entry to become available. To not sleep, pass in the XPC_NOWAIT flag.
+ *
+ * Arguments:
+ *
+ *	partid - ID of partition to which the channel is connected.
+ *	ch_number - channel #.
+ *	flags - see xpc.h for valid flags.
+ *	payload - address of the allocated payload area pointer (filled in on
+ * 	          return) in which the user-defined message is constructed.
+ */
+enum xpc_retval
+xpc_initiate_allocate(partid_t partid, int ch_number, u32 flags, void **payload)
+{
+	struct xpc_partition *part = &xpc_partitions[partid];
+	enum xpc_retval ret = xpcUnknownReason;
+	struct xpc_msg *msg = NULL;
+
+	DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
+	DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
+
+	*payload = NULL;
+
+	if (xpc_part_ref(part)) {
+		ret = xpc_allocate_msg(&part->channels[ch_number], flags, &msg);
+		xpc_part_deref(part);
+
+		if (msg != NULL)
+			*payload = &msg->payload;
+	}
+
+	return ret;
+}
+
+/*
+ * Now we actually send the messages that are ready to be sent by advancing
+ * the local message queue's Put value and then send an IPI to the recipient
+ * partition.
+ */
+static void
+xpc_send_msgs(struct xpc_channel *ch, s64 initial_put)
+{
+	struct xpc_msg *msg;
+	s64 put = initial_put + 1;
+	int send_IPI = 0;
+
+	while (1) {
+
+		while (1) {
+			if (put == ch->w_local_GP.put)
+				break;
+
+			msg = (struct xpc_msg *)((u64)ch->local_msgqueue +
+						 (put % ch->local_nentries) *
+						 ch->msg_size);
+
+			if (!(msg->flags & XPC_M_READY))
+				break;
+
+			put++;
+		}
+
+		if (put == initial_put) {
+			/* nothing's changed */
+			break;
+		}
+
+		if (cmpxchg_rel(&ch->local_GP->put, initial_put, put) !=
+		    initial_put) {
+			/* someone else beat us to it */
+			DBUG_ON(ch->local_GP->put < initial_put);
+			break;
+		}
+
+		/* we just set the new value of local_GP->put */
+
+		dev_dbg(xpc_chan, "local_GP->put changed to %ld, partid=%d, "
+			"channel=%d\n", put, ch->partid, ch->number);
+
+		send_IPI = 1;
+
+		/*
+		 * We need to ensure that the message referenced by
+		 * local_GP->put is not XPC_M_READY or that local_GP->put
+		 * equals w_local_GP.put, so we'll go have a look.
+		 */
+		initial_put = put;
+	}
+
+	if (send_IPI)
+		xpc_IPI_send_msgrequest(ch);
+}
+
+/*
+ * Common code that does the actual sending of the message by advancing the
+ * local message queue's Put value and sends an IPI to the partition the
+ * message is being sent to.
+ */
+static enum xpc_retval
+xpc_send_msg(struct xpc_channel *ch, struct xpc_msg *msg, u8 notify_type,
+	     xpc_notify_func func, void *key)
+{
+	enum xpc_retval ret = xpcSuccess;
+	struct xpc_notify *notify = notify;
+	s64 put, msg_number = msg->number;
+
+	DBUG_ON(notify_type == XPC_N_CALL && func == NULL);
+	DBUG_ON((((u64)msg - (u64)ch->local_msgqueue) / ch->msg_size) !=
+		msg_number % ch->local_nentries);
+	DBUG_ON(msg->flags & XPC_M_READY);
+
+	if (ch->flags & XPC_C_DISCONNECTING) {
+		/* drop the reference grabbed in xpc_allocate_msg() */
+		xpc_msgqueue_deref(ch);
+		return ch->reason;
+	}
+
+	if (notify_type != 0) {
+		/*
+		 * Tell the remote side to send an ACK interrupt when the
+		 * message has been delivered.
+		 */
+		msg->flags |= XPC_M_INTERRUPT;
+
+		atomic_inc(&ch->n_to_notify);
+
+		notify = &ch->notify_queue[msg_number % ch->local_nentries];
+		notify->func = func;
+		notify->key = key;
+		notify->type = notify_type;
+
+		/* >>> is a mb() needed here? */
+
+		if (ch->flags & XPC_C_DISCONNECTING) {
+			/*
+			 * An error occurred between our last error check and
+			 * this one. We will try to clear the type field from
+			 * the notify entry. If we succeed then
+			 * xpc_disconnect_channel() didn't already process
+			 * the notify entry.
+			 */
+			if (cmpxchg(&notify->type, notify_type, 0) ==
+			    notify_type) {
+				atomic_dec(&ch->n_to_notify);
+				ret = ch->reason;
+			}
+
+			/* drop the reference grabbed in xpc_allocate_msg() */
+			xpc_msgqueue_deref(ch);
+			return ret;
+		}
+	}
+
+	msg->flags |= XPC_M_READY;
+
+	/*
+	 * The preceding store of msg->flags must occur before the following
+	 * load of ch->local_GP->put.
+	 */
+	mb();
+
+	/* see if the message is next in line to be sent, if so send it */
+
+	put = ch->local_GP->put;
+	if (put == msg_number)
+		xpc_send_msgs(ch, put);
+
+	/* drop the reference grabbed in xpc_allocate_msg() */
+	xpc_msgqueue_deref(ch);
+	return ret;
+}
+
+/*
+ * Send a message previously allocated using xpc_initiate_allocate() on the
+ * specified channel connected to the specified partition.
+ *
+ * This routine will not wait for the message to be received, nor will
+ * notification be given when it does happen. Once this routine has returned
+ * the message entry allocated via xpc_initiate_allocate() is no longer
+ * accessable to the caller.
+ *
+ * This routine, although called by users, does not call xpc_part_ref() to
+ * ensure that the partition infrastructure is in place. It relies on the
+ * fact that we called xpc_msgqueue_ref() in xpc_allocate_msg().
+ *
+ * Arguments:
+ *
+ *	partid - ID of partition to which the channel is connected.
+ *	ch_number - channel # to send message on.
+ *	payload - pointer to the payload area allocated via
+ *			xpc_initiate_allocate().
+ */
+enum xpc_retval
+xpc_initiate_send(partid_t partid, int ch_number, void *payload)
+{
+	struct xpc_partition *part = &xpc_partitions[partid];
+	struct xpc_msg *msg = XPC_MSG_ADDRESS(payload);
+	enum xpc_retval ret;
+
+	dev_dbg(xpc_chan, "msg=0x%p, partid=%d, channel=%d\n", (void *)msg,
+		partid, ch_number);
+
+	DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
+	DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
+	DBUG_ON(msg == NULL);
+
+	ret = xpc_send_msg(&part->channels[ch_number], msg, 0, NULL, NULL);
+
+	return ret;
+}
+
+/*
+ * Send a message previously allocated using xpc_initiate_allocate on the
+ * specified channel connected to the specified partition.
+ *
+ * This routine will not wait for the message to be sent. Once this routine
+ * has returned the message entry allocated via xpc_initiate_allocate() is no
+ * longer accessable to the caller.
+ *
+ * Once the remote end of the channel has received the message, the function
+ * passed as an argument to xpc_initiate_send_notify() will be called. This
+ * allows the sender to free up or re-use any buffers referenced by the
+ * message, but does NOT mean the message has been processed at the remote
+ * end by a receiver.
+ *
+ * If this routine returns an error, the caller's function will NOT be called.
+ *
+ * This routine, although called by users, does not call xpc_part_ref() to
+ * ensure that the partition infrastructure is in place. It relies on the
+ * fact that we called xpc_msgqueue_ref() in xpc_allocate_msg().
+ *
+ * Arguments:
+ *
+ *	partid - ID of partition to which the channel is connected.
+ *	ch_number - channel # to send message on.
+ *	payload - pointer to the payload area allocated via
+ *			xpc_initiate_allocate().
+ *	func - function to call with asynchronous notification of message
+ *		  receipt. THIS FUNCTION MUST BE NON-BLOCKING.
+ *	key - user-defined key to be passed to the function when it's called.
+ */
+enum xpc_retval
+xpc_initiate_send_notify(partid_t partid, int ch_number, void *payload,
+			 xpc_notify_func func, void *key)
+{
+	struct xpc_partition *part = &xpc_partitions[partid];
+	struct xpc_msg *msg = XPC_MSG_ADDRESS(payload);
+	enum xpc_retval ret;
+
+	dev_dbg(xpc_chan, "msg=0x%p, partid=%d, channel=%d\n", (void *)msg,
+		partid, ch_number);
+
+	DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
+	DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
+	DBUG_ON(msg == NULL);
+	DBUG_ON(func == NULL);
+
+	ret = xpc_send_msg(&part->channels[ch_number], msg, XPC_N_CALL,
+			   func, key);
+	return ret;
+}
+
+static struct xpc_msg *
+xpc_pull_remote_msg(struct xpc_channel *ch, s64 get)
+{
+	struct xpc_partition *part = &xpc_partitions[ch->partid];
+	struct xpc_msg *remote_msg, *msg;
+	u32 msg_index, nmsgs;
+	u64 msg_offset;
+	enum xpc_retval ret;
+
+	if (mutex_lock_interruptible(&ch->msg_to_pull_mutex) != 0) {
+		/* we were interrupted by a signal */
+		return NULL;
+	}
+
+	while (get >= ch->next_msg_to_pull) {
+
+		/* pull as many messages as are ready and able to be pulled */
+
+		msg_index = ch->next_msg_to_pull % ch->remote_nentries;
+
+		DBUG_ON(ch->next_msg_to_pull >= ch->w_remote_GP.put);
+		nmsgs = ch->w_remote_GP.put - ch->next_msg_to_pull;
+		if (msg_index + nmsgs > ch->remote_nentries) {
+			/* ignore the ones that wrap the msg queue for now */
+			nmsgs = ch->remote_nentries - msg_index;
+		}
+
+		msg_offset = msg_index * ch->msg_size;
+		msg = (struct xpc_msg *)((u64)ch->remote_msgqueue + msg_offset);
+		remote_msg = (struct xpc_msg *)(ch->remote_msgqueue_pa +
+						msg_offset);
+
+		ret = xpc_pull_remote_cachelines(part, msg, remote_msg,
+						 nmsgs * ch->msg_size);
+		if (ret != xpcSuccess) {
+
+			dev_dbg(xpc_chan, "failed to pull %d msgs starting with"
+				" msg %ld from partition %d, channel=%d, "
+				"ret=%d\n", nmsgs, ch->next_msg_to_pull,
+				ch->partid, ch->number, ret);
+
+			XPC_DEACTIVATE_PARTITION(part, ret);
+
+			mutex_unlock(&ch->msg_to_pull_mutex);
+			return NULL;
+		}
+
+		ch->next_msg_to_pull += nmsgs;
+	}
+
+	mutex_unlock(&ch->msg_to_pull_mutex);
+
+	/* return the message we were looking for */
+	msg_offset = (get % ch->remote_nentries) * ch->msg_size;
+	msg = (struct xpc_msg *)((u64)ch->remote_msgqueue + msg_offset);
+
+	return msg;
+}
+
+/*
+ * Get a message to be delivered.
+ */
+static struct xpc_msg *
+xpc_get_deliverable_msg(struct xpc_channel *ch)
+{
+	struct xpc_msg *msg = NULL;
+	s64 get;
+
+	do {
+		if (ch->flags & XPC_C_DISCONNECTING)
+			break;
+
+		get = ch->w_local_GP.get;
+		rmb();	/* guarantee that .get loads before .put */
+		if (get == ch->w_remote_GP.put)
+			break;
+
+		/* There are messages waiting to be pulled and delivered.
+		 * We need to try to secure one for ourselves. We'll do this
+		 * by trying to increment w_local_GP.get and hope that no one
+		 * else beats us to it. If they do, we'll we'll simply have
+		 * to try again for the next one.
+		 */
+
+		if (cmpxchg(&ch->w_local_GP.get, get, get + 1) == get) {
+			/* we got the entry referenced by get */
+
+			dev_dbg(xpc_chan, "w_local_GP.get changed to %ld, "
+				"partid=%d, channel=%d\n", get + 1,
+				ch->partid, ch->number);
+
+			/* pull the message from the remote partition */
+
+			msg = xpc_pull_remote_msg(ch, get);
+
+			DBUG_ON(msg != NULL && msg->number != get);
+			DBUG_ON(msg != NULL && (msg->flags & XPC_M_DONE));
+			DBUG_ON(msg != NULL && !(msg->flags & XPC_M_READY));
+
+			break;
+		}
+
+	} while (1);
+
+	return msg;
+}
+
+/*
+ * Deliver a message to its intended recipient.
+ */
+void
+xpc_deliver_msg(struct xpc_channel *ch)
+{
+	struct xpc_msg *msg;
+
+	msg = xpc_get_deliverable_msg(ch);
+	if (msg != NULL) {
+
+		/*
+		 * This ref is taken to protect the payload itself from being
+		 * freed before the user is finished with it, which the user
+		 * indicates by calling xpc_initiate_received().
+		 */
+		xpc_msgqueue_ref(ch);
+
+		atomic_inc(&ch->kthreads_active);
+
+		if (ch->func != NULL) {
+			dev_dbg(xpc_chan, "ch->func() called, msg=0x%p, "
+				"msg_number=%ld, partid=%d, channel=%d\n",
+				(void *)msg, msg->number, ch->partid,
+				ch->number);
+
+			/* deliver the message to its intended recipient */
+			ch->func(xpcMsgReceived, ch->partid, ch->number,
+				 &msg->payload, ch->key);
+
+			dev_dbg(xpc_chan, "ch->func() returned, msg=0x%p, "
+				"msg_number=%ld, partid=%d, channel=%d\n",
+				(void *)msg, msg->number, ch->partid,
+				ch->number);
+		}
+
+		atomic_dec(&ch->kthreads_active);
+	}
+}
+
+/*
+ * Now we actually acknowledge the messages that have been delivered and ack'd
+ * by advancing the cached remote message queue's Get value and if requested
+ * send an IPI to the message sender's partition.
+ */
+static void
+xpc_acknowledge_msgs(struct xpc_channel *ch, s64 initial_get, u8 msg_flags)
+{
+	struct xpc_msg *msg;
+	s64 get = initial_get + 1;
+	int send_IPI = 0;
+
+	while (1) {
+
+		while (1) {
+			if (get == ch->w_local_GP.get)
+				break;
+
+			msg = (struct xpc_msg *)((u64)ch->remote_msgqueue +
+						 (get % ch->remote_nentries) *
+						 ch->msg_size);
+
+			if (!(msg->flags & XPC_M_DONE))
+				break;
+
+			msg_flags |= msg->flags;
+			get++;
+		}
+
+		if (get == initial_get) {
+			/* nothing's changed */
+			break;
+		}
+
+		if (cmpxchg_rel(&ch->local_GP->get, initial_get, get) !=
+		    initial_get) {
+			/* someone else beat us to it */
+			DBUG_ON(ch->local_GP->get <= initial_get);
+			break;
+		}
+
+		/* we just set the new value of local_GP->get */
+
+		dev_dbg(xpc_chan, "local_GP->get changed to %ld, partid=%d, "
+			"channel=%d\n", get, ch->partid, ch->number);
+
+		send_IPI = (msg_flags & XPC_M_INTERRUPT);
+
+		/*
+		 * We need to ensure that the message referenced by
+		 * local_GP->get is not XPC_M_DONE or that local_GP->get
+		 * equals w_local_GP.get, so we'll go have a look.
+		 */
+		initial_get = get;
+	}
+
+	if (send_IPI)
+		xpc_IPI_send_msgrequest(ch);
+}
+
+/*
+ * Acknowledge receipt of a delivered message.
+ *
+ * If a message has XPC_M_INTERRUPT set, send an interrupt to the partition
+ * that sent the message.
+ *
+ * This function, although called by users, does not call xpc_part_ref() to
+ * ensure that the partition infrastructure is in place. It relies on the
+ * fact that we called xpc_msgqueue_ref() in xpc_deliver_msg().
+ *
+ * Arguments:
+ *
+ *	partid - ID of partition to which the channel is connected.
+ *	ch_number - channel # message received on.
+ *	payload - pointer to the payload area allocated via
+ *			xpc_initiate_allocate().
+ */
+void
+xpc_initiate_received(partid_t partid, int ch_number, void *payload)
+{
+	struct xpc_partition *part = &xpc_partitions[partid];
+	struct xpc_channel *ch;
+	struct xpc_msg *msg = XPC_MSG_ADDRESS(payload);
+	s64 get, msg_number = msg->number;
+
+	DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
+	DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
+
+	ch = &part->channels[ch_number];
+
+	dev_dbg(xpc_chan, "msg=0x%p, msg_number=%ld, partid=%d, channel=%d\n",
+		(void *)msg, msg_number, ch->partid, ch->number);
+
+	DBUG_ON((((u64)msg - (u64)ch->remote_msgqueue) / ch->msg_size) !=
+		msg_number % ch->remote_nentries);
+	DBUG_ON(msg->flags & XPC_M_DONE);
+
+	msg->flags |= XPC_M_DONE;
+
+	/*
+	 * The preceding store of msg->flags must occur before the following
+	 * load of ch->local_GP->get.
+	 */
+	mb();
+
+	/*
+	 * See if this message is next in line to be acknowledged as having
+	 * been delivered.
+	 */
+	get = ch->local_GP->get;
+	if (get == msg_number)
+		xpc_acknowledge_msgs(ch, get, msg->flags);
+
+	/* the call to xpc_msgqueue_ref() was done by xpc_deliver_msg()  */
+	xpc_msgqueue_deref(ch);
+}
diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c
new file mode 100644
index 00000000000..f673ba90eb0
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_main.c
@@ -0,0 +1,1323 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2004-2008 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+/*
+ * Cross Partition Communication (XPC) support - standard version.
+ *
+ *	XPC provides a message passing capability that crosses partition
+ *	boundaries. This module is made up of two parts:
+ *
+ *	    partition	This part detects the presence/absence of other
+ *			partitions. It provides a heartbeat and monitors
+ *			the heartbeats of other partitions.
+ *
+ *	    channel	This part manages the channels and sends/receives
+ *			messages across them to/from other partitions.
+ *
+ *	There are a couple of additional functions residing in XP, which
+ *	provide an interface to XPC for its users.
+ *
+ *
+ *	Caveats:
+ *
+ *	  . We currently have no way to determine which nasid an IPI came
+ *	    from. Thus, xpc_IPI_send() does a remote AMO write followed by
+ *	    an IPI. The AMO indicates where data is to be pulled from, so
+ *	    after the IPI arrives, the remote partition checks the AMO word.
+ *	    The IPI can actually arrive before the AMO however, so other code
+ *	    must periodically check for this case. Also, remote AMO operations
+ *	    do not reliably time out. Thus we do a remote PIO read solely to
+ *	    know whether the remote partition is down and whether we should
+ *	    stop sending IPIs to it. This remote PIO read operation is set up
+ *	    in a special nofault region so SAL knows to ignore (and cleanup)
+ *	    any errors due to the remote AMO write, PIO read, and/or PIO
+ *	    write operations.
+ *
+ *	    If/when new hardware solves this IPI problem, we should abandon
+ *	    the current approach.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/completion.h>
+#include <linux/kdebug.h>
+#include <linux/kthread.h>
+#include <linux/uaccess.h>
+#include <asm/sn/intr.h>
+#include <asm/sn/sn_sal.h>
+#include "xpc.h"
+
+/* define two XPC debug device structures to be used with dev_dbg() et al */
+
+struct device_driver xpc_dbg_name = {
+	.name = "xpc"
+};
+
+struct device xpc_part_dbg_subname = {
+	.bus_id = {0},		/* set to "part" at xpc_init() time */
+	.driver = &xpc_dbg_name
+};
+
+struct device xpc_chan_dbg_subname = {
+	.bus_id = {0},		/* set to "chan" at xpc_init() time */
+	.driver = &xpc_dbg_name
+};
+
+struct device *xpc_part = &xpc_part_dbg_subname;
+struct device *xpc_chan = &xpc_chan_dbg_subname;
+
+static int xpc_kdebug_ignore;
+
+/* systune related variables for /proc/sys directories */
+
+static int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL;
+static int xpc_hb_min_interval = 1;
+static int xpc_hb_max_interval = 10;
+
+static int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_INTERVAL;
+static int xpc_hb_check_min_interval = 10;
+static int xpc_hb_check_max_interval = 120;
+
+int xpc_disengage_request_timelimit = XPC_DISENGAGE_REQUEST_DEFAULT_TIMELIMIT;
+static int xpc_disengage_request_min_timelimit;	/* = 0 */
+static int xpc_disengage_request_max_timelimit = 120;
+
+static ctl_table xpc_sys_xpc_hb_dir[] = {
+	{
+	 .ctl_name = CTL_UNNUMBERED,
+	 .procname = "hb_interval",
+	 .data = &xpc_hb_interval,
+	 .maxlen = sizeof(int),
+	 .mode = 0644,
+	 .proc_handler = &proc_dointvec_minmax,
+	 .strategy = &sysctl_intvec,
+	 .extra1 = &xpc_hb_min_interval,
+	 .extra2 = &xpc_hb_max_interval},
+	{
+	 .ctl_name = CTL_UNNUMBERED,
+	 .procname = "hb_check_interval",
+	 .data = &xpc_hb_check_interval,
+	 .maxlen = sizeof(int),
+	 .mode = 0644,
+	 .proc_handler = &proc_dointvec_minmax,
+	 .strategy = &sysctl_intvec,
+	 .extra1 = &xpc_hb_check_min_interval,
+	 .extra2 = &xpc_hb_check_max_interval},
+	{}
+};
+static ctl_table xpc_sys_xpc_dir[] = {
+	{
+	 .ctl_name = CTL_UNNUMBERED,
+	 .procname = "hb",
+	 .mode = 0555,
+	 .child = xpc_sys_xpc_hb_dir},
+	{
+	 .ctl_name = CTL_UNNUMBERED,
+	 .procname = "disengage_request_timelimit",
+	 .data = &xpc_disengage_request_timelimit,
+	 .maxlen = sizeof(int),
+	 .mode = 0644,
+	 .proc_handler = &proc_dointvec_minmax,
+	 .strategy = &sysctl_intvec,
+	 .extra1 = &xpc_disengage_request_min_timelimit,
+	 .extra2 = &xpc_disengage_request_max_timelimit},
+	{}
+};
+static ctl_table xpc_sys_dir[] = {
+	{
+	 .ctl_name = CTL_UNNUMBERED,
+	 .procname = "xpc",
+	 .mode = 0555,
+	 .child = xpc_sys_xpc_dir},
+	{}
+};
+static struct ctl_table_header *xpc_sysctl;
+
+/* non-zero if any remote partition disengage request was timed out */
+int xpc_disengage_request_timedout;
+
+/* #of IRQs received */
+static atomic_t xpc_act_IRQ_rcvd;
+
+/* IRQ handler notifies this wait queue on receipt of an IRQ */
+static DECLARE_WAIT_QUEUE_HEAD(xpc_act_IRQ_wq);
+
+static unsigned long xpc_hb_check_timeout;
+
+/* notification that the xpc_hb_checker thread has exited */
+static DECLARE_COMPLETION(xpc_hb_checker_exited);
+
+/* notification that the xpc_discovery thread has exited */
+static DECLARE_COMPLETION(xpc_discovery_exited);
+
+static struct timer_list xpc_hb_timer;
+
+static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *);
+
+static int xpc_system_reboot(struct notifier_block *, unsigned long, void *);
+static struct notifier_block xpc_reboot_notifier = {
+	.notifier_call = xpc_system_reboot,
+};
+
+static int xpc_system_die(struct notifier_block *, unsigned long, void *);
+static struct notifier_block xpc_die_notifier = {
+	.notifier_call = xpc_system_die,
+};
+
+/*
+ * Timer function to enforce the timelimit on the partition disengage request.
+ */
+static void
+xpc_timeout_partition_disengage_request(unsigned long data)
+{
+	struct xpc_partition *part = (struct xpc_partition *)data;
+
+	DBUG_ON(time_before(jiffies, part->disengage_request_timeout));
+
+	(void)xpc_partition_disengaged(part);
+
+	DBUG_ON(part->disengage_request_timeout != 0);
+	DBUG_ON(xpc_partition_engaged(1UL << XPC_PARTID(part)) != 0);
+}
+
+/*
+ * Notify the heartbeat check thread that an IRQ has been received.
+ */
+static irqreturn_t
+xpc_act_IRQ_handler(int irq, void *dev_id)
+{
+	atomic_inc(&xpc_act_IRQ_rcvd);
+	wake_up_interruptible(&xpc_act_IRQ_wq);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Timer to produce the heartbeat.  The timer structures function is
+ * already set when this is initially called.  A tunable is used to
+ * specify when the next timeout should occur.
+ */
+static void
+xpc_hb_beater(unsigned long dummy)
+{
+	xpc_vars->heartbeat++;
+
+	if (time_after_eq(jiffies, xpc_hb_check_timeout))
+		wake_up_interruptible(&xpc_act_IRQ_wq);
+
+	xpc_hb_timer.expires = jiffies + (xpc_hb_interval * HZ);
+	add_timer(&xpc_hb_timer);
+}
+
+/*
+ * This thread is responsible for nearly all of the partition
+ * activation/deactivation.
+ */
+static int
+xpc_hb_checker(void *ignore)
+{
+	int last_IRQ_count = 0;
+	int new_IRQ_count;
+	int force_IRQ = 0;
+
+	/* this thread was marked active by xpc_hb_init() */
+
+	set_cpus_allowed(current, cpumask_of_cpu(XPC_HB_CHECK_CPU));
+
+	/* set our heartbeating to other partitions into motion */
+	xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
+	xpc_hb_beater(0);
+
+	while (!xpc_exiting) {
+
+		dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have "
+			"been received\n",
+			(int)(xpc_hb_check_timeout - jiffies),
+			atomic_read(&xpc_act_IRQ_rcvd) - last_IRQ_count);
+
+		/* checking of remote heartbeats is skewed by IRQ handling */
+		if (time_after_eq(jiffies, xpc_hb_check_timeout)) {
+			dev_dbg(xpc_part, "checking remote heartbeats\n");
+			xpc_check_remote_hb();
+
+			/*
+			 * We need to periodically recheck to ensure no
+			 * IPI/AMO pairs have been missed.  That check
+			 * must always reset xpc_hb_check_timeout.
+			 */
+			force_IRQ = 1;
+		}
+
+		/* check for outstanding IRQs */
+		new_IRQ_count = atomic_read(&xpc_act_IRQ_rcvd);
+		if (last_IRQ_count < new_IRQ_count || force_IRQ != 0) {
+			force_IRQ = 0;
+
+			dev_dbg(xpc_part, "found an IRQ to process; will be "
+				"resetting xpc_hb_check_timeout\n");
+
+			last_IRQ_count += xpc_identify_act_IRQ_sender();
+			if (last_IRQ_count < new_IRQ_count) {
+				/* retry once to help avoid missing AMO */
+				(void)xpc_identify_act_IRQ_sender();
+			}
+			last_IRQ_count = new_IRQ_count;
+
+			xpc_hb_check_timeout = jiffies +
+			    (xpc_hb_check_interval * HZ);
+		}
+
+		/* wait for IRQ or timeout */
+		(void)wait_event_interruptible(xpc_act_IRQ_wq,
+					       (last_IRQ_count <
+						atomic_read(&xpc_act_IRQ_rcvd)
+						|| time_after_eq(jiffies,
+							xpc_hb_check_timeout) ||
+						xpc_exiting));
+	}
+
+	dev_dbg(xpc_part, "heartbeat checker is exiting\n");
+
+	/* mark this thread as having exited */
+	complete(&xpc_hb_checker_exited);
+	return 0;
+}
+
+/*
+ * This thread will attempt to discover other partitions to activate
+ * based on info provided by SAL. This new thread is short lived and
+ * will exit once discovery is complete.
+ */
+static int
+xpc_initiate_discovery(void *ignore)
+{
+	xpc_discovery();
+
+	dev_dbg(xpc_part, "discovery thread is exiting\n");
+
+	/* mark this thread as having exited */
+	complete(&xpc_discovery_exited);
+	return 0;
+}
+
+/*
+ * Establish first contact with the remote partititon. This involves pulling
+ * the XPC per partition variables from the remote partition and waiting for
+ * the remote partition to pull ours.
+ */
+static enum xpc_retval
+xpc_make_first_contact(struct xpc_partition *part)
+{
+	enum xpc_retval ret;
+
+	while ((ret = xpc_pull_remote_vars_part(part)) != xpcSuccess) {
+		if (ret != xpcRetry) {
+			XPC_DEACTIVATE_PARTITION(part, ret);
+			return ret;
+		}
+
+		dev_dbg(xpc_chan, "waiting to make first contact with "
+			"partition %d\n", XPC_PARTID(part));
+
+		/* wait a 1/4 of a second or so */
+		(void)msleep_interruptible(250);
+
+		if (part->act_state == XPC_P_DEACTIVATING)
+			return part->reason;
+	}
+
+	return xpc_mark_partition_active(part);
+}
+
+/*
+ * The first kthread assigned to a newly activated partition is the one
+ * created by XPC HB with which it calls xpc_partition_up(). XPC hangs on to
+ * that kthread until the partition is brought down, at which time that kthread
+ * returns back to XPC HB. (The return of that kthread will signify to XPC HB
+ * that XPC has dismantled all communication infrastructure for the associated
+ * partition.) This kthread becomes the channel manager for that partition.
+ *
+ * Each active partition has a channel manager, who, besides connecting and
+ * disconnecting channels, will ensure that each of the partition's connected
+ * channels has the required number of assigned kthreads to get the work done.
+ */
+static void
+xpc_channel_mgr(struct xpc_partition *part)
+{
+	while (part->act_state != XPC_P_DEACTIVATING ||
+	       atomic_read(&part->nchannels_active) > 0 ||
+	       !xpc_partition_disengaged(part)) {
+
+		xpc_process_channel_activity(part);
+
+		/*
+		 * Wait until we've been requested to activate kthreads or
+		 * all of the channel's message queues have been torn down or
+		 * a signal is pending.
+		 *
+		 * The channel_mgr_requests is set to 1 after being awakened,
+		 * This is done to prevent the channel mgr from making one pass
+		 * through the loop for each request, since he will
+		 * be servicing all the requests in one pass. The reason it's
+		 * set to 1 instead of 0 is so that other kthreads will know
+		 * that the channel mgr is running and won't bother trying to
+		 * wake him up.
+		 */
+		atomic_dec(&part->channel_mgr_requests);
+		(void)wait_event_interruptible(part->channel_mgr_wq,
+				(atomic_read(&part->channel_mgr_requests) > 0 ||
+				 part->local_IPI_amo != 0 ||
+				 (part->act_state == XPC_P_DEACTIVATING &&
+				 atomic_read(&part->nchannels_active) == 0 &&
+				 xpc_partition_disengaged(part))));
+		atomic_set(&part->channel_mgr_requests, 1);
+	}
+}
+
+/*
+ * When XPC HB determines that a partition has come up, it will create a new
+ * kthread and that kthread will call this function to attempt to set up the
+ * basic infrastructure used for Cross Partition Communication with the newly
+ * upped partition.
+ *
+ * The kthread that was created by XPC HB and which setup the XPC
+ * infrastructure will remain assigned to the partition until the partition
+ * goes down. At which time the kthread will teardown the XPC infrastructure
+ * and then exit.
+ *
+ * XPC HB will put the remote partition's XPC per partition specific variables
+ * physical address into xpc_partitions[partid].remote_vars_part_pa prior to
+ * calling xpc_partition_up().
+ */
+static void
+xpc_partition_up(struct xpc_partition *part)
+{
+	DBUG_ON(part->channels != NULL);
+
+	dev_dbg(xpc_chan, "activating partition %d\n", XPC_PARTID(part));
+
+	if (xpc_setup_infrastructure(part) != xpcSuccess)
+		return;
+
+	/*
+	 * The kthread that XPC HB called us with will become the
+	 * channel manager for this partition. It will not return
+	 * back to XPC HB until the partition's XPC infrastructure
+	 * has been dismantled.
+	 */
+
+	(void)xpc_part_ref(part);	/* this will always succeed */
+
+	if (xpc_make_first_contact(part) == xpcSuccess)
+		xpc_channel_mgr(part);
+
+	xpc_part_deref(part);
+
+	xpc_teardown_infrastructure(part);
+}
+
+static int
+xpc_activating(void *__partid)
+{
+	partid_t partid = (u64)__partid;
+	struct xpc_partition *part = &xpc_partitions[partid];
+	unsigned long irq_flags;
+
+	DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
+
+	spin_lock_irqsave(&part->act_lock, irq_flags);
+
+	if (part->act_state == XPC_P_DEACTIVATING) {
+		part->act_state = XPC_P_INACTIVE;
+		spin_unlock_irqrestore(&part->act_lock, irq_flags);
+		part->remote_rp_pa = 0;
+		return 0;
+	}
+
+	/* indicate the thread is activating */
+	DBUG_ON(part->act_state != XPC_P_ACTIVATION_REQ);
+	part->act_state = XPC_P_ACTIVATING;
+
+	XPC_SET_REASON(part, 0, 0);
+	spin_unlock_irqrestore(&part->act_lock, irq_flags);
+
+	dev_dbg(xpc_part, "bringing partition %d up\n", partid);
+
+	/*
+	 * Register the remote partition's AMOs with SAL so it can handle
+	 * and cleanup errors within that address range should the remote
+	 * partition go down. We don't unregister this range because it is
+	 * difficult to tell when outstanding writes to the remote partition
+	 * are finished and thus when it is safe to unregister. This should
+	 * not result in wasted space in the SAL xp_addr_region table because
+	 * we should get the same page for remote_amos_page_pa after module
+	 * reloads and system reboots.
+	 */
+	if (sn_register_xp_addr_region(part->remote_amos_page_pa,
+				       PAGE_SIZE, 1) < 0) {
+		dev_warn(xpc_part, "xpc_partition_up(%d) failed to register "
+			 "xp_addr region\n", partid);
+
+		spin_lock_irqsave(&part->act_lock, irq_flags);
+		part->act_state = XPC_P_INACTIVE;
+		XPC_SET_REASON(part, xpcPhysAddrRegFailed, __LINE__);
+		spin_unlock_irqrestore(&part->act_lock, irq_flags);
+		part->remote_rp_pa = 0;
+		return 0;
+	}
+
+	xpc_allow_hb(partid, xpc_vars);
+	xpc_IPI_send_activated(part);
+
+	/*
+	 * xpc_partition_up() holds this thread and marks this partition as
+	 * XPC_P_ACTIVE by calling xpc_hb_mark_active().
+	 */
+	(void)xpc_partition_up(part);
+
+	xpc_disallow_hb(partid, xpc_vars);
+	xpc_mark_partition_inactive(part);
+
+	if (part->reason == xpcReactivating) {
+		/* interrupting ourselves results in activating partition */
+		xpc_IPI_send_reactivate(part);
+	}
+
+	return 0;
+}
+
+void
+xpc_activate_partition(struct xpc_partition *part)
+{
+	partid_t partid = XPC_PARTID(part);
+	unsigned long irq_flags;
+	struct task_struct *kthread;
+
+	spin_lock_irqsave(&part->act_lock, irq_flags);
+
+	DBUG_ON(part->act_state != XPC_P_INACTIVE);
+
+	part->act_state = XPC_P_ACTIVATION_REQ;
+	XPC_SET_REASON(part, xpcCloneKThread, __LINE__);
+
+	spin_unlock_irqrestore(&part->act_lock, irq_flags);
+
+	kthread = kthread_run(xpc_activating, (void *)((u64)partid), "xpc%02d",
+			      partid);
+	if (IS_ERR(kthread)) {
+		spin_lock_irqsave(&part->act_lock, irq_flags);
+		part->act_state = XPC_P_INACTIVE;
+		XPC_SET_REASON(part, xpcCloneKThreadFailed, __LINE__);
+		spin_unlock_irqrestore(&part->act_lock, irq_flags);
+	}
+}
+
+/*
+ * Handle the receipt of a SGI_XPC_NOTIFY IRQ by seeing whether the specified
+ * partition actually sent it. Since SGI_XPC_NOTIFY IRQs may be shared by more
+ * than one partition, we use an AMO_t structure per partition to indicate
+ * whether a partition has sent an IPI or not.  If it has, then wake up the
+ * associated kthread to handle it.
+ *
+ * All SGI_XPC_NOTIFY IRQs received by XPC are the result of IPIs sent by XPC
+ * running on other partitions.
+ *
+ * Noteworthy Arguments:
+ *
+ *	irq - Interrupt ReQuest number. NOT USED.
+ *
+ *	dev_id - partid of IPI's potential sender.
+ */
+irqreturn_t
+xpc_notify_IRQ_handler(int irq, void *dev_id)
+{
+	partid_t partid = (partid_t) (u64)dev_id;
+	struct xpc_partition *part = &xpc_partitions[partid];
+
+	DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
+
+	if (xpc_part_ref(part)) {
+		xpc_check_for_channel_activity(part);
+
+		xpc_part_deref(part);
+	}
+	return IRQ_HANDLED;
+}
+
+/*
+ * Check to see if xpc_notify_IRQ_handler() dropped any IPIs on the floor
+ * because the write to their associated IPI amo completed after the IRQ/IPI
+ * was received.
+ */
+void
+xpc_dropped_IPI_check(struct xpc_partition *part)
+{
+	if (xpc_part_ref(part)) {
+		xpc_check_for_channel_activity(part);
+
+		part->dropped_IPI_timer.expires = jiffies +
+		    XPC_P_DROPPED_IPI_WAIT;
+		add_timer(&part->dropped_IPI_timer);
+		xpc_part_deref(part);
+	}
+}
+
+void
+xpc_activate_kthreads(struct xpc_channel *ch, int needed)
+{
+	int idle = atomic_read(&ch->kthreads_idle);
+	int assigned = atomic_read(&ch->kthreads_assigned);
+	int wakeup;
+
+	DBUG_ON(needed <= 0);
+
+	if (idle > 0) {
+		wakeup = (needed > idle) ? idle : needed;
+		needed -= wakeup;
+
+		dev_dbg(xpc_chan, "wakeup %d idle kthreads, partid=%d, "
+			"channel=%d\n", wakeup, ch->partid, ch->number);
+
+		/* only wakeup the requested number of kthreads */
+		wake_up_nr(&ch->idle_wq, wakeup);
+	}
+
+	if (needed <= 0)
+		return;
+
+	if (needed + assigned > ch->kthreads_assigned_limit) {
+		needed = ch->kthreads_assigned_limit - assigned;
+		if (needed <= 0)
+			return;
+	}
+
+	dev_dbg(xpc_chan, "create %d new kthreads, partid=%d, channel=%d\n",
+		needed, ch->partid, ch->number);
+
+	xpc_create_kthreads(ch, needed, 0);
+}
+
+/*
+ * This function is where XPC's kthreads wait for messages to deliver.
+ */
+static void
+xpc_kthread_waitmsgs(struct xpc_partition *part, struct xpc_channel *ch)
+{
+	do {
+		/* deliver messages to their intended recipients */
+
+		while (ch->w_local_GP.get < ch->w_remote_GP.put &&
+		       !(ch->flags & XPC_C_DISCONNECTING)) {
+			xpc_deliver_msg(ch);
+		}
+
+		if (atomic_inc_return(&ch->kthreads_idle) >
+		    ch->kthreads_idle_limit) {
+			/* too many idle kthreads on this channel */
+			atomic_dec(&ch->kthreads_idle);
+			break;
+		}
+
+		dev_dbg(xpc_chan, "idle kthread calling "
+			"wait_event_interruptible_exclusive()\n");
+
+		(void)wait_event_interruptible_exclusive(ch->idle_wq,
+				(ch->w_local_GP.get < ch->w_remote_GP.put ||
+				 (ch->flags & XPC_C_DISCONNECTING)));
+
+		atomic_dec(&ch->kthreads_idle);
+
+	} while (!(ch->flags & XPC_C_DISCONNECTING));
+}
+
+static int
+xpc_kthread_start(void *args)
+{
+	partid_t partid = XPC_UNPACK_ARG1(args);
+	u16 ch_number = XPC_UNPACK_ARG2(args);
+	struct xpc_partition *part = &xpc_partitions[partid];
+	struct xpc_channel *ch;
+	int n_needed;
+	unsigned long irq_flags;
+
+	dev_dbg(xpc_chan, "kthread starting, partid=%d, channel=%d\n",
+		partid, ch_number);
+
+	ch = &part->channels[ch_number];
+
+	if (!(ch->flags & XPC_C_DISCONNECTING)) {
+
+		/* let registerer know that connection has been established */
+
+		spin_lock_irqsave(&ch->lock, irq_flags);
+		if (!(ch->flags & XPC_C_CONNECTEDCALLOUT)) {
+			ch->flags |= XPC_C_CONNECTEDCALLOUT;
+			spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+			xpc_connected_callout(ch);
+
+			spin_lock_irqsave(&ch->lock, irq_flags);
+			ch->flags |= XPC_C_CONNECTEDCALLOUT_MADE;
+			spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+			/*
+			 * It is possible that while the callout was being
+			 * made that the remote partition sent some messages.
+			 * If that is the case, we may need to activate
+			 * additional kthreads to help deliver them. We only
+			 * need one less than total #of messages to deliver.
+			 */
+			n_needed = ch->w_remote_GP.put - ch->w_local_GP.get - 1;
+			if (n_needed > 0 && !(ch->flags & XPC_C_DISCONNECTING))
+				xpc_activate_kthreads(ch, n_needed);
+
+		} else {
+			spin_unlock_irqrestore(&ch->lock, irq_flags);
+		}
+
+		xpc_kthread_waitmsgs(part, ch);
+	}
+
+	/* let registerer know that connection is disconnecting */
+
+	spin_lock_irqsave(&ch->lock, irq_flags);
+	if ((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
+	    !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) {
+		ch->flags |= XPC_C_DISCONNECTINGCALLOUT;
+		spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+		xpc_disconnect_callout(ch, xpcDisconnecting);
+
+		spin_lock_irqsave(&ch->lock, irq_flags);
+		ch->flags |= XPC_C_DISCONNECTINGCALLOUT_MADE;
+	}
+	spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+	if (atomic_dec_return(&ch->kthreads_assigned) == 0) {
+		if (atomic_dec_return(&part->nchannels_engaged) == 0) {
+			xpc_mark_partition_disengaged(part);
+			xpc_IPI_send_disengage(part);
+		}
+	}
+
+	xpc_msgqueue_deref(ch);
+
+	dev_dbg(xpc_chan, "kthread exiting, partid=%d, channel=%d\n",
+		partid, ch_number);
+
+	xpc_part_deref(part);
+	return 0;
+}
+
+/*
+ * For each partition that XPC has established communications with, there is
+ * a minimum of one kernel thread assigned to perform any operation that
+ * may potentially sleep or block (basically the callouts to the asynchronous
+ * functions registered via xpc_connect()).
+ *
+ * Additional kthreads are created and destroyed by XPC as the workload
+ * demands.
+ *
+ * A kthread is assigned to one of the active channels that exists for a given
+ * partition.
+ */
+void
+xpc_create_kthreads(struct xpc_channel *ch, int needed,
+		    int ignore_disconnecting)
+{
+	unsigned long irq_flags;
+	u64 args = XPC_PACK_ARGS(ch->partid, ch->number);
+	struct xpc_partition *part = &xpc_partitions[ch->partid];
+	struct task_struct *kthread;
+
+	while (needed-- > 0) {
+
+		/*
+		 * The following is done on behalf of the newly created
+		 * kthread. That kthread is responsible for doing the
+		 * counterpart to the following before it exits.
+		 */
+		if (ignore_disconnecting) {
+			if (!atomic_inc_not_zero(&ch->kthreads_assigned)) {
+				/* kthreads assigned had gone to zero */
+				BUG_ON(!(ch->flags &
+					 XPC_C_DISCONNECTINGCALLOUT_MADE));
+				break;
+			}
+
+		} else if (ch->flags & XPC_C_DISCONNECTING) {
+			break;
+
+		} else if (atomic_inc_return(&ch->kthreads_assigned) == 1) {
+			if (atomic_inc_return(&part->nchannels_engaged) == 1)
+				xpc_mark_partition_engaged(part);
+		}
+		(void)xpc_part_ref(part);
+		xpc_msgqueue_ref(ch);
+
+		kthread = kthread_run(xpc_kthread_start, (void *)args,
+				      "xpc%02dc%d", ch->partid, ch->number);
+		if (IS_ERR(kthread)) {
+			/* the fork failed */
+
+			/*
+			 * NOTE: if (ignore_disconnecting &&
+			 * !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) is true,
+			 * then we'll deadlock if all other kthreads assigned
+			 * to this channel are blocked in the channel's
+			 * registerer, because the only thing that will unblock
+			 * them is the xpcDisconnecting callout that this
+			 * failed kthread_run() would have made.
+			 */
+
+			if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
+			    atomic_dec_return(&part->nchannels_engaged) == 0) {
+				xpc_mark_partition_disengaged(part);
+				xpc_IPI_send_disengage(part);
+			}
+			xpc_msgqueue_deref(ch);
+			xpc_part_deref(part);
+
+			if (atomic_read(&ch->kthreads_assigned) <
+			    ch->kthreads_idle_limit) {
+				/*
+				 * Flag this as an error only if we have an
+				 * insufficient #of kthreads for the channel
+				 * to function.
+				 */
+				spin_lock_irqsave(&ch->lock, irq_flags);
+				XPC_DISCONNECT_CHANNEL(ch, xpcLackOfResources,
+						       &irq_flags);
+				spin_unlock_irqrestore(&ch->lock, irq_flags);
+			}
+			break;
+		}
+	}
+}
+
+void
+xpc_disconnect_wait(int ch_number)
+{
+	unsigned long irq_flags;
+	partid_t partid;
+	struct xpc_partition *part;
+	struct xpc_channel *ch;
+	int wakeup_channel_mgr;
+
+	/* now wait for all callouts to the caller's function to cease */
+	for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
+		part = &xpc_partitions[partid];
+
+		if (!xpc_part_ref(part))
+			continue;
+
+		ch = &part->channels[ch_number];
+
+		if (!(ch->flags & XPC_C_WDISCONNECT)) {
+			xpc_part_deref(part);
+			continue;
+		}
+
+		wait_for_completion(&ch->wdisconnect_wait);
+
+		spin_lock_irqsave(&ch->lock, irq_flags);
+		DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
+		wakeup_channel_mgr = 0;
+
+		if (ch->delayed_IPI_flags) {
+			if (part->act_state != XPC_P_DEACTIVATING) {
+				spin_lock(&part->IPI_lock);
+				XPC_SET_IPI_FLAGS(part->local_IPI_amo,
+						  ch->number,
+						  ch->delayed_IPI_flags);
+				spin_unlock(&part->IPI_lock);
+				wakeup_channel_mgr = 1;
+			}
+			ch->delayed_IPI_flags = 0;
+		}
+
+		ch->flags &= ~XPC_C_WDISCONNECT;
+		spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+		if (wakeup_channel_mgr)
+			xpc_wakeup_channel_mgr(part);
+
+		xpc_part_deref(part);
+	}
+}
+
+static void
+xpc_do_exit(enum xpc_retval reason)
+{
+	partid_t partid;
+	int active_part_count, printed_waiting_msg = 0;
+	struct xpc_partition *part;
+	unsigned long printmsg_time, disengage_request_timeout = 0;
+
+	/* a 'rmmod XPC' and a 'reboot' cannot both end up here together */
+	DBUG_ON(xpc_exiting == 1);
+
+	/*
+	 * Let the heartbeat checker thread and the discovery thread
+	 * (if one is running) know that they should exit. Also wake up
+	 * the heartbeat checker thread in case it's sleeping.
+	 */
+	xpc_exiting = 1;
+	wake_up_interruptible(&xpc_act_IRQ_wq);
+
+	/* ignore all incoming interrupts */
+	free_irq(SGI_XPC_ACTIVATE, NULL);
+
+	/* wait for the discovery thread to exit */
+	wait_for_completion(&xpc_discovery_exited);
+
+	/* wait for the heartbeat checker thread to exit */
+	wait_for_completion(&xpc_hb_checker_exited);
+
+	/* sleep for a 1/3 of a second or so */
+	(void)msleep_interruptible(300);
+
+	/* wait for all partitions to become inactive */
+
+	printmsg_time = jiffies + (XPC_DISENGAGE_PRINTMSG_INTERVAL * HZ);
+	xpc_disengage_request_timedout = 0;
+
+	do {
+		active_part_count = 0;
+
+		for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
+			part = &xpc_partitions[partid];
+
+			if (xpc_partition_disengaged(part) &&
+			    part->act_state == XPC_P_INACTIVE) {
+				continue;
+			}
+
+			active_part_count++;
+
+			XPC_DEACTIVATE_PARTITION(part, reason);
+
+			if (part->disengage_request_timeout >
+			    disengage_request_timeout) {
+				disengage_request_timeout =
+				    part->disengage_request_timeout;
+			}
+		}
+
+		if (xpc_partition_engaged(-1UL)) {
+			if (time_after(jiffies, printmsg_time)) {
+				dev_info(xpc_part, "waiting for remote "
+					 "partitions to disengage, timeout in "
+					 "%ld seconds\n",
+					 (disengage_request_timeout - jiffies)
+					 / HZ);
+				printmsg_time = jiffies +
+				    (XPC_DISENGAGE_PRINTMSG_INTERVAL * HZ);
+				printed_waiting_msg = 1;
+			}
+
+		} else if (active_part_count > 0) {
+			if (printed_waiting_msg) {
+				dev_info(xpc_part, "waiting for local partition"
+					 " to disengage\n");
+				printed_waiting_msg = 0;
+			}
+
+		} else {
+			if (!xpc_disengage_request_timedout) {
+				dev_info(xpc_part, "all partitions have "
+					 "disengaged\n");
+			}
+			break;
+		}
+
+		/* sleep for a 1/3 of a second or so */
+		(void)msleep_interruptible(300);
+
+	} while (1);
+
+	DBUG_ON(xpc_partition_engaged(-1UL));
+
+	/* indicate to others that our reserved page is uninitialized */
+	xpc_rsvd_page->vars_pa = 0;
+
+	/* now it's time to eliminate our heartbeat */
+	del_timer_sync(&xpc_hb_timer);
+	DBUG_ON(xpc_vars->heartbeating_to_mask != 0);
+
+	if (reason == xpcUnloading) {
+		/* take ourselves off of the reboot_notifier_list */
+		(void)unregister_reboot_notifier(&xpc_reboot_notifier);
+
+		/* take ourselves off of the die_notifier list */
+		(void)unregister_die_notifier(&xpc_die_notifier);
+	}
+
+	/* close down protections for IPI operations */
+	xpc_restrict_IPI_ops();
+
+	/* clear the interface to XPC's functions */
+	xpc_clear_interface();
+
+	if (xpc_sysctl)
+		unregister_sysctl_table(xpc_sysctl);
+
+	kfree(xpc_remote_copy_buffer_base);
+}
+
+/*
+ * This function is called when the system is being rebooted.
+ */
+static int
+xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused)
+{
+	enum xpc_retval reason;
+
+	switch (event) {
+	case SYS_RESTART:
+		reason = xpcSystemReboot;
+		break;
+	case SYS_HALT:
+		reason = xpcSystemHalt;
+		break;
+	case SYS_POWER_OFF:
+		reason = xpcSystemPoweroff;
+		break;
+	default:
+		reason = xpcSystemGoingDown;
+	}
+
+	xpc_do_exit(reason);
+	return NOTIFY_DONE;
+}
+
+/*
+ * Notify other partitions to disengage from all references to our memory.
+ */
+static void
+xpc_die_disengage(void)
+{
+	struct xpc_partition *part;
+	partid_t partid;
+	unsigned long engaged;
+	long time, printmsg_time, disengage_request_timeout;
+
+	/* keep xpc_hb_checker thread from doing anything (just in case) */
+	xpc_exiting = 1;
+
+	xpc_vars->heartbeating_to_mask = 0;	/* indicate we're deactivated */
+
+	for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
+		part = &xpc_partitions[partid];
+
+		if (!XPC_SUPPORTS_DISENGAGE_REQUEST(part->
+		    remote_vars_version)) {
+
+			/* just in case it was left set by an earlier XPC */
+			xpc_clear_partition_engaged(1UL << partid);
+			continue;
+		}
+
+		if (xpc_partition_engaged(1UL << partid) ||
+		    part->act_state != XPC_P_INACTIVE) {
+			xpc_request_partition_disengage(part);
+			xpc_mark_partition_disengaged(part);
+			xpc_IPI_send_disengage(part);
+		}
+	}
+
+	time = rtc_time();
+	printmsg_time = time +
+	    (XPC_DISENGAGE_PRINTMSG_INTERVAL * sn_rtc_cycles_per_second);
+	disengage_request_timeout = time +
+	    (xpc_disengage_request_timelimit * sn_rtc_cycles_per_second);
+
+	/* wait for all other partitions to disengage from us */
+
+	while (1) {
+		engaged = xpc_partition_engaged(-1UL);
+		if (!engaged) {
+			dev_info(xpc_part, "all partitions have disengaged\n");
+			break;
+		}
+
+		time = rtc_time();
+		if (time >= disengage_request_timeout) {
+			for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
+				if (engaged & (1UL << partid)) {
+					dev_info(xpc_part, "disengage from "
+						 "remote partition %d timed "
+						 "out\n", partid);
+				}
+			}
+			break;
+		}
+
+		if (time >= printmsg_time) {
+			dev_info(xpc_part, "waiting for remote partitions to "
+				 "disengage, timeout in %ld seconds\n",
+				 (disengage_request_timeout - time) /
+				 sn_rtc_cycles_per_second);
+			printmsg_time = time +
+			    (XPC_DISENGAGE_PRINTMSG_INTERVAL *
+			     sn_rtc_cycles_per_second);
+		}
+	}
+}
+
+/*
+ * This function is called when the system is being restarted or halted due
+ * to some sort of system failure. If this is the case we need to notify the
+ * other partitions to disengage from all references to our memory.
+ * This function can also be called when our heartbeater could be offlined
+ * for a time. In this case we need to notify other partitions to not worry
+ * about the lack of a heartbeat.
+ */
+static int
+xpc_system_die(struct notifier_block *nb, unsigned long event, void *unused)
+{
+	switch (event) {
+	case DIE_MACHINE_RESTART:
+	case DIE_MACHINE_HALT:
+		xpc_die_disengage();
+		break;
+
+	case DIE_KDEBUG_ENTER:
+		/* Should lack of heartbeat be ignored by other partitions? */
+		if (!xpc_kdebug_ignore)
+			break;
+
+		/* fall through */
+	case DIE_MCA_MONARCH_ENTER:
+	case DIE_INIT_MONARCH_ENTER:
+		xpc_vars->heartbeat++;
+		xpc_vars->heartbeat_offline = 1;
+		break;
+
+	case DIE_KDEBUG_LEAVE:
+		/* Is lack of heartbeat being ignored by other partitions? */
+		if (!xpc_kdebug_ignore)
+			break;
+
+		/* fall through */
+	case DIE_MCA_MONARCH_LEAVE:
+	case DIE_INIT_MONARCH_LEAVE:
+		xpc_vars->heartbeat++;
+		xpc_vars->heartbeat_offline = 0;
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+int __init
+xpc_init(void)
+{
+	int ret;
+	partid_t partid;
+	struct xpc_partition *part;
+	struct task_struct *kthread;
+	size_t buf_size;
+
+	if (!ia64_platform_is("sn2"))
+		return -ENODEV;
+
+	buf_size = max(XPC_RP_VARS_SIZE,
+		       XPC_RP_HEADER_SIZE + XP_NASID_MASK_BYTES);
+	xpc_remote_copy_buffer = xpc_kmalloc_cacheline_aligned(buf_size,
+							       GFP_KERNEL,
+						  &xpc_remote_copy_buffer_base);
+	if (xpc_remote_copy_buffer == NULL)
+		return -ENOMEM;
+
+	snprintf(xpc_part->bus_id, BUS_ID_SIZE, "part");
+	snprintf(xpc_chan->bus_id, BUS_ID_SIZE, "chan");
+
+	xpc_sysctl = register_sysctl_table(xpc_sys_dir);
+
+	/*
+	 * The first few fields of each entry of xpc_partitions[] need to
+	 * be initialized now so that calls to xpc_connect() and
+	 * xpc_disconnect() can be made prior to the activation of any remote
+	 * partition. NOTE THAT NONE OF THE OTHER FIELDS BELONGING TO THESE
+	 * ENTRIES ARE MEANINGFUL UNTIL AFTER AN ENTRY'S CORRESPONDING
+	 * PARTITION HAS BEEN ACTIVATED.
+	 */
+	for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
+		part = &xpc_partitions[partid];
+
+		DBUG_ON((u64)part != L1_CACHE_ALIGN((u64)part));
+
+		part->act_IRQ_rcvd = 0;
+		spin_lock_init(&part->act_lock);
+		part->act_state = XPC_P_INACTIVE;
+		XPC_SET_REASON(part, 0, 0);
+
+		init_timer(&part->disengage_request_timer);
+		part->disengage_request_timer.function =
+		    xpc_timeout_partition_disengage_request;
+		part->disengage_request_timer.data = (unsigned long)part;
+
+		part->setup_state = XPC_P_UNSET;
+		init_waitqueue_head(&part->teardown_wq);
+		atomic_set(&part->references, 0);
+	}
+
+	/*
+	 * Open up protections for IPI operations (and AMO operations on
+	 * Shub 1.1 systems).
+	 */
+	xpc_allow_IPI_ops();
+
+	/*
+	 * Interrupts being processed will increment this atomic variable and
+	 * awaken the heartbeat thread which will process the interrupts.
+	 */
+	atomic_set(&xpc_act_IRQ_rcvd, 0);
+
+	/*
+	 * This is safe to do before the xpc_hb_checker thread has started
+	 * because the handler releases a wait queue.  If an interrupt is
+	 * received before the thread is waiting, it will not go to sleep,
+	 * but rather immediately process the interrupt.
+	 */
+	ret = request_irq(SGI_XPC_ACTIVATE, xpc_act_IRQ_handler, 0,
+			  "xpc hb", NULL);
+	if (ret != 0) {
+		dev_err(xpc_part, "can't register ACTIVATE IRQ handler, "
+			"errno=%d\n", -ret);
+
+		xpc_restrict_IPI_ops();
+
+		if (xpc_sysctl)
+			unregister_sysctl_table(xpc_sysctl);
+
+		kfree(xpc_remote_copy_buffer_base);
+		return -EBUSY;
+	}
+
+	/*
+	 * Fill the partition reserved page with the information needed by
+	 * other partitions to discover we are alive and establish initial
+	 * communications.
+	 */
+	xpc_rsvd_page = xpc_rsvd_page_init();
+	if (xpc_rsvd_page == NULL) {
+		dev_err(xpc_part, "could not setup our reserved page\n");
+
+		free_irq(SGI_XPC_ACTIVATE, NULL);
+		xpc_restrict_IPI_ops();
+
+		if (xpc_sysctl)
+			unregister_sysctl_table(xpc_sysctl);
+
+		kfree(xpc_remote_copy_buffer_base);
+		return -EBUSY;
+	}
+
+	/* add ourselves to the reboot_notifier_list */
+	ret = register_reboot_notifier(&xpc_reboot_notifier);
+	if (ret != 0)
+		dev_warn(xpc_part, "can't register reboot notifier\n");
+
+	/* add ourselves to the die_notifier list */
+	ret = register_die_notifier(&xpc_die_notifier);
+	if (ret != 0)
+		dev_warn(xpc_part, "can't register die notifier\n");
+
+	init_timer(&xpc_hb_timer);
+	xpc_hb_timer.function = xpc_hb_beater;
+
+	/*
+	 * The real work-horse behind xpc.  This processes incoming
+	 * interrupts and monitors remote heartbeats.
+	 */
+	kthread = kthread_run(xpc_hb_checker, NULL, XPC_HB_CHECK_THREAD_NAME);
+	if (IS_ERR(kthread)) {
+		dev_err(xpc_part, "failed while forking hb check thread\n");
+
+		/* indicate to others that our reserved page is uninitialized */
+		xpc_rsvd_page->vars_pa = 0;
+
+		/* take ourselves off of the reboot_notifier_list */
+		(void)unregister_reboot_notifier(&xpc_reboot_notifier);
+
+		/* take ourselves off of the die_notifier list */
+		(void)unregister_die_notifier(&xpc_die_notifier);
+
+		del_timer_sync(&xpc_hb_timer);
+		free_irq(SGI_XPC_ACTIVATE, NULL);
+		xpc_restrict_IPI_ops();
+
+		if (xpc_sysctl)
+			unregister_sysctl_table(xpc_sysctl);
+
+		kfree(xpc_remote_copy_buffer_base);
+		return -EBUSY;
+	}
+
+	/*
+	 * Startup a thread that will attempt to discover other partitions to
+	 * activate based on info provided by SAL. This new thread is short
+	 * lived and will exit once discovery is complete.
+	 */
+	kthread = kthread_run(xpc_initiate_discovery, NULL,
+			      XPC_DISCOVERY_THREAD_NAME);
+	if (IS_ERR(kthread)) {
+		dev_err(xpc_part, "failed while forking discovery thread\n");
+
+		/* mark this new thread as a non-starter */
+		complete(&xpc_discovery_exited);
+
+		xpc_do_exit(xpcUnloading);
+		return -EBUSY;
+	}
+
+	/* set the interface to point at XPC's functions */
+	xpc_set_interface(xpc_initiate_connect, xpc_initiate_disconnect,
+			  xpc_initiate_allocate, xpc_initiate_send,
+			  xpc_initiate_send_notify, xpc_initiate_received,
+			  xpc_initiate_partid_to_nasids);
+
+	return 0;
+}
+
+module_init(xpc_init);
+
+void __exit
+xpc_exit(void)
+{
+	xpc_do_exit(xpcUnloading);
+}
+
+module_exit(xpc_exit);
+
+MODULE_AUTHOR("Silicon Graphics, Inc.");
+MODULE_DESCRIPTION("Cross Partition Communication (XPC) support");
+MODULE_LICENSE("GPL");
+
+module_param(xpc_hb_interval, int, 0);
+MODULE_PARM_DESC(xpc_hb_interval, "Number of seconds between "
+		 "heartbeat increments.");
+
+module_param(xpc_hb_check_interval, int, 0);
+MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between "
+		 "heartbeat checks.");
+
+module_param(xpc_disengage_request_timelimit, int, 0);
+MODULE_PARM_DESC(xpc_disengage_request_timelimit, "Number of seconds to wait "
+		 "for disengage request to complete.");
+
+module_param(xpc_kdebug_ignore, int, 0);
+MODULE_PARM_DESC(xpc_kdebug_ignore, "Should lack of heartbeat be ignored by "
+		 "other partitions when dropping into kdebug.");
diff --git a/drivers/misc/sgi-xp/xpc_partition.c b/drivers/misc/sgi-xp/xpc_partition.c
new file mode 100644
index 00000000000..27e200ec582
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_partition.c
@@ -0,0 +1,1174 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2004-2008 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+/*
+ * Cross Partition Communication (XPC) partition support.
+ *
+ *	This is the part of XPC that detects the presence/absence of
+ *	other partitions. It provides a heartbeat and monitors the
+ *	heartbeats of other partitions.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/sysctl.h>
+#include <linux/cache.h>
+#include <linux/mmzone.h>
+#include <linux/nodemask.h>
+#include <asm/uncached.h>
+#include <asm/sn/bte.h>
+#include <asm/sn/intr.h>
+#include <asm/sn/sn_sal.h>
+#include <asm/sn/nodepda.h>
+#include <asm/sn/addrs.h>
+#include "xpc.h"
+
+/* XPC is exiting flag */
+int xpc_exiting;
+
+/* SH_IPI_ACCESS shub register value on startup */
+static u64 xpc_sh1_IPI_access;
+static u64 xpc_sh2_IPI_access0;
+static u64 xpc_sh2_IPI_access1;
+static u64 xpc_sh2_IPI_access2;
+static u64 xpc_sh2_IPI_access3;
+
+/* original protection values for each node */
+u64 xpc_prot_vec[MAX_NUMNODES];
+
+/* this partition's reserved page pointers */
+struct xpc_rsvd_page *xpc_rsvd_page;
+static u64 *xpc_part_nasids;
+static u64 *xpc_mach_nasids;
+struct xpc_vars *xpc_vars;
+struct xpc_vars_part *xpc_vars_part;
+
+static int xp_nasid_mask_bytes;	/* actual size in bytes of nasid mask */
+static int xp_nasid_mask_words;	/* actual size in words of nasid mask */
+
+/*
+ * For performance reasons, each entry of xpc_partitions[] is cacheline
+ * aligned. And xpc_partitions[] is padded with an additional entry at the
+ * end so that the last legitimate entry doesn't share its cacheline with
+ * another variable.
+ */
+struct xpc_partition xpc_partitions[XP_MAX_PARTITIONS + 1];
+
+/*
+ * Generic buffer used to store a local copy of portions of a remote
+ * partition's reserved page (either its header and part_nasids mask,
+ * or its vars).
+ */
+char *xpc_remote_copy_buffer;
+void *xpc_remote_copy_buffer_base;
+
+/*
+ * Guarantee that the kmalloc'd memory is cacheline aligned.
+ */
+void *
+xpc_kmalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
+{
+	/* see if kmalloc will give us cachline aligned memory by default */
+	*base = kmalloc(size, flags);
+	if (*base == NULL)
+		return NULL;
+
+	if ((u64)*base == L1_CACHE_ALIGN((u64)*base))
+		return *base;
+
+	kfree(*base);
+
+	/* nope, we'll have to do it ourselves */
+	*base = kmalloc(size + L1_CACHE_BYTES, flags);
+	if (*base == NULL)
+		return NULL;
+
+	return (void *)L1_CACHE_ALIGN((u64)*base);
+}
+
+/*
+ * Given a nasid, get the physical address of the  partition's reserved page
+ * for that nasid. This function returns 0 on any error.
+ */
+static u64
+xpc_get_rsvd_page_pa(int nasid)
+{
+	bte_result_t bte_res;
+	s64 status;
+	u64 cookie = 0;
+	u64 rp_pa = nasid;	/* seed with nasid */
+	u64 len = 0;
+	u64 buf = buf;
+	u64 buf_len = 0;
+	void *buf_base = NULL;
+
+	while (1) {
+
+		status = sn_partition_reserved_page_pa(buf, &cookie, &rp_pa,
+						       &len);
+
+		dev_dbg(xpc_part, "SAL returned with status=%li, cookie="
+			"0x%016lx, address=0x%016lx, len=0x%016lx\n",
+			status, cookie, rp_pa, len);
+
+		if (status != SALRET_MORE_PASSES)
+			break;
+
+		if (L1_CACHE_ALIGN(len) > buf_len) {
+			kfree(buf_base);
+			buf_len = L1_CACHE_ALIGN(len);
+			buf = (u64)xpc_kmalloc_cacheline_aligned(buf_len,
+								 GFP_KERNEL,
+								 &buf_base);
+			if (buf_base == NULL) {
+				dev_err(xpc_part, "unable to kmalloc "
+					"len=0x%016lx\n", buf_len);
+				status = SALRET_ERROR;
+				break;
+			}
+		}
+
+		bte_res = xp_bte_copy(rp_pa, buf, buf_len,
+				      (BTE_NOTIFY | BTE_WACQUIRE), NULL);
+		if (bte_res != BTE_SUCCESS) {
+			dev_dbg(xpc_part, "xp_bte_copy failed %i\n", bte_res);
+			status = SALRET_ERROR;
+			break;
+		}
+	}
+
+	kfree(buf_base);
+
+	if (status != SALRET_OK)
+		rp_pa = 0;
+
+	dev_dbg(xpc_part, "reserved page at phys address 0x%016lx\n", rp_pa);
+	return rp_pa;
+}
+
+/*
+ * Fill the partition reserved page with the information needed by
+ * other partitions to discover we are alive and establish initial
+ * communications.
+ */
+struct xpc_rsvd_page *
+xpc_rsvd_page_init(void)
+{
+	struct xpc_rsvd_page *rp;
+	AMO_t *amos_page;
+	u64 rp_pa, nasid_array = 0;
+	int i, ret;
+
+	/* get the local reserved page's address */
+
+	preempt_disable();
+	rp_pa = xpc_get_rsvd_page_pa(cpuid_to_nasid(smp_processor_id()));
+	preempt_enable();
+	if (rp_pa == 0) {
+		dev_err(xpc_part, "SAL failed to locate the reserved page\n");
+		return NULL;
+	}
+	rp = (struct xpc_rsvd_page *)__va(rp_pa);
+
+	if (rp->partid != sn_partition_id) {
+		dev_err(xpc_part, "the reserved page's partid of %d should be "
+			"%d\n", rp->partid, sn_partition_id);
+		return NULL;
+	}
+
+	rp->version = XPC_RP_VERSION;
+
+	/* establish the actual sizes of the nasid masks */
+	if (rp->SAL_version == 1) {
+		/* SAL_version 1 didn't set the nasids_size field */
+		rp->nasids_size = 128;
+	}
+	xp_nasid_mask_bytes = rp->nasids_size;
+	xp_nasid_mask_words = xp_nasid_mask_bytes / 8;
+
+	/* setup the pointers to the various items in the reserved page */
+	xpc_part_nasids = XPC_RP_PART_NASIDS(rp);
+	xpc_mach_nasids = XPC_RP_MACH_NASIDS(rp);
+	xpc_vars = XPC_RP_VARS(rp);
+	xpc_vars_part = XPC_RP_VARS_PART(rp);
+
+	/*
+	 * Before clearing xpc_vars, see if a page of AMOs had been previously
+	 * allocated. If not we'll need to allocate one and set permissions
+	 * so that cross-partition AMOs are allowed.
+	 *
+	 * The allocated AMO page needs MCA reporting to remain disabled after
+	 * XPC has unloaded.  To make this work, we keep a copy of the pointer
+	 * to this page (i.e., amos_page) in the struct xpc_vars structure,
+	 * which is pointed to by the reserved page, and re-use that saved copy
+	 * on subsequent loads of XPC. This AMO page is never freed, and its
+	 * memory protections are never restricted.
+	 */
+	amos_page = xpc_vars->amos_page;
+	if (amos_page == NULL) {
+		amos_page = (AMO_t *)TO_AMO(uncached_alloc_page(0));
+		if (amos_page == NULL) {
+			dev_err(xpc_part, "can't allocate page of AMOs\n");
+			return NULL;
+		}
+
+		/*
+		 * Open up AMO-R/W to cpu.  This is done for Shub 1.1 systems
+		 * when xpc_allow_IPI_ops() is called via xpc_hb_init().
+		 */
+		if (!enable_shub_wars_1_1()) {
+			ret = sn_change_memprotect(ia64_tpa((u64)amos_page),
+						   PAGE_SIZE,
+						   SN_MEMPROT_ACCESS_CLASS_1,
+						   &nasid_array);
+			if (ret != 0) {
+				dev_err(xpc_part, "can't change memory "
+					"protections\n");
+				uncached_free_page(__IA64_UNCACHED_OFFSET |
+						   TO_PHYS((u64)amos_page));
+				return NULL;
+			}
+		}
+	} else if (!IS_AMO_ADDRESS((u64)amos_page)) {
+		/*
+		 * EFI's XPBOOT can also set amos_page in the reserved page,
+		 * but it happens to leave it as an uncached physical address
+		 * and we need it to be an uncached virtual, so we'll have to
+		 * convert it.
+		 */
+		if (!IS_AMO_PHYS_ADDRESS((u64)amos_page)) {
+			dev_err(xpc_part, "previously used amos_page address "
+				"is bad = 0x%p\n", (void *)amos_page);
+			return NULL;
+		}
+		amos_page = (AMO_t *)TO_AMO((u64)amos_page);
+	}
+
+	/* clear xpc_vars */
+	memset(xpc_vars, 0, sizeof(struct xpc_vars));
+
+	xpc_vars->version = XPC_V_VERSION;
+	xpc_vars->act_nasid = cpuid_to_nasid(0);
+	xpc_vars->act_phys_cpuid = cpu_physical_id(0);
+	xpc_vars->vars_part_pa = __pa(xpc_vars_part);
+	xpc_vars->amos_page_pa = ia64_tpa((u64)amos_page);
+	xpc_vars->amos_page = amos_page;	/* save for next load of XPC */
+
+	/* clear xpc_vars_part */
+	memset((u64 *)xpc_vars_part, 0, sizeof(struct xpc_vars_part) *
+	       XP_MAX_PARTITIONS);
+
+	/* initialize the activate IRQ related AMO variables */
+	for (i = 0; i < xp_nasid_mask_words; i++)
+		(void)xpc_IPI_init(XPC_ACTIVATE_IRQ_AMOS + i);
+
+	/* initialize the engaged remote partitions related AMO variables */
+	(void)xpc_IPI_init(XPC_ENGAGED_PARTITIONS_AMO);
+	(void)xpc_IPI_init(XPC_DISENGAGE_REQUEST_AMO);
+
+	/* timestamp of when reserved page was setup by XPC */
+	rp->stamp = CURRENT_TIME;
+
+	/*
+	 * This signifies to the remote partition that our reserved
+	 * page is initialized.
+	 */
+	rp->vars_pa = __pa(xpc_vars);
+
+	return rp;
+}
+
+/*
+ * Change protections to allow IPI operations (and AMO operations on
+ * Shub 1.1 systems).
+ */
+void
+xpc_allow_IPI_ops(void)
+{
+	int node;
+	int nasid;
+
+	/* >>> Change SH_IPI_ACCESS code to use SAL call once it is available */
+
+	if (is_shub2()) {
+		xpc_sh2_IPI_access0 =
+		    (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS0));
+		xpc_sh2_IPI_access1 =
+		    (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS1));
+		xpc_sh2_IPI_access2 =
+		    (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS2));
+		xpc_sh2_IPI_access3 =
+		    (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS3));
+
+		for_each_online_node(node) {
+			nasid = cnodeid_to_nasid(node);
+			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS0),
+			      -1UL);
+			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS1),
+			      -1UL);
+			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS2),
+			      -1UL);
+			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS3),
+			      -1UL);
+		}
+
+	} else {
+		xpc_sh1_IPI_access =
+		    (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH1_IPI_ACCESS));
+
+		for_each_online_node(node) {
+			nasid = cnodeid_to_nasid(node);
+			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH1_IPI_ACCESS),
+			      -1UL);
+
+			/*
+			 * Since the BIST collides with memory operations on
+			 * SHUB 1.1 sn_change_memprotect() cannot be used.
+			 */
+			if (enable_shub_wars_1_1()) {
+				/* open up everything */
+				xpc_prot_vec[node] = (u64)HUB_L((u64 *)
+								GLOBAL_MMR_ADDR
+								(nasid,
+						  SH1_MD_DQLP_MMR_DIR_PRIVEC0));
+				HUB_S((u64 *)
+				      GLOBAL_MMR_ADDR(nasid,
+						   SH1_MD_DQLP_MMR_DIR_PRIVEC0),
+				      -1UL);
+				HUB_S((u64 *)
+				      GLOBAL_MMR_ADDR(nasid,
+						   SH1_MD_DQRP_MMR_DIR_PRIVEC0),
+				      -1UL);
+			}
+		}
+	}
+}
+
+/*
+ * Restrict protections to disallow IPI operations (and AMO operations on
+ * Shub 1.1 systems).
+ */
+void
+xpc_restrict_IPI_ops(void)
+{
+	int node;
+	int nasid;
+
+	/* >>> Change SH_IPI_ACCESS code to use SAL call once it is available */
+
+	if (is_shub2()) {
+
+		for_each_online_node(node) {
+			nasid = cnodeid_to_nasid(node);
+			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS0),
+			      xpc_sh2_IPI_access0);
+			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS1),
+			      xpc_sh2_IPI_access1);
+			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS2),
+			      xpc_sh2_IPI_access2);
+			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS3),
+			      xpc_sh2_IPI_access3);
+		}
+
+	} else {
+
+		for_each_online_node(node) {
+			nasid = cnodeid_to_nasid(node);
+			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH1_IPI_ACCESS),
+			      xpc_sh1_IPI_access);
+
+			if (enable_shub_wars_1_1()) {
+				HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid,
+						   SH1_MD_DQLP_MMR_DIR_PRIVEC0),
+				      xpc_prot_vec[node]);
+				HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid,
+						   SH1_MD_DQRP_MMR_DIR_PRIVEC0),
+				      xpc_prot_vec[node]);
+			}
+		}
+	}
+}
+
+/*
+ * At periodic intervals, scan through all active partitions and ensure
+ * their heartbeat is still active.  If not, the partition is deactivated.
+ */
+void
+xpc_check_remote_hb(void)
+{
+	struct xpc_vars *remote_vars;
+	struct xpc_partition *part;
+	partid_t partid;
+	bte_result_t bres;
+
+	remote_vars = (struct xpc_vars *)xpc_remote_copy_buffer;
+
+	for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
+
+		if (xpc_exiting)
+			break;
+
+		if (partid == sn_partition_id)
+			continue;
+
+		part = &xpc_partitions[partid];
+
+		if (part->act_state == XPC_P_INACTIVE ||
+		    part->act_state == XPC_P_DEACTIVATING) {
+			continue;
+		}
+
+		/* pull the remote_hb cache line */
+		bres = xp_bte_copy(part->remote_vars_pa,
+				   (u64)remote_vars,
+				   XPC_RP_VARS_SIZE,
+				   (BTE_NOTIFY | BTE_WACQUIRE), NULL);
+		if (bres != BTE_SUCCESS) {
+			XPC_DEACTIVATE_PARTITION(part,
+						 xpc_map_bte_errors(bres));
+			continue;
+		}
+
+		dev_dbg(xpc_part, "partid = %d, heartbeat = %ld, last_heartbeat"
+			" = %ld, heartbeat_offline = %ld, HB_mask = 0x%lx\n",
+			partid, remote_vars->heartbeat, part->last_heartbeat,
+			remote_vars->heartbeat_offline,
+			remote_vars->heartbeating_to_mask);
+
+		if (((remote_vars->heartbeat == part->last_heartbeat) &&
+		     (remote_vars->heartbeat_offline == 0)) ||
+		    !xpc_hb_allowed(sn_partition_id, remote_vars)) {
+
+			XPC_DEACTIVATE_PARTITION(part, xpcNoHeartbeat);
+			continue;
+		}
+
+		part->last_heartbeat = remote_vars->heartbeat;
+	}
+}
+
+/*
+ * Get a copy of a portion of the remote partition's rsvd page.
+ *
+ * remote_rp points to a buffer that is cacheline aligned for BTE copies and
+ * is large enough to contain a copy of their reserved page header and
+ * part_nasids mask.
+ */
+static enum xpc_retval
+xpc_get_remote_rp(int nasid, u64 *discovered_nasids,
+		  struct xpc_rsvd_page *remote_rp, u64 *remote_rp_pa)
+{
+	int bres, i;
+
+	/* get the reserved page's physical address */
+
+	*remote_rp_pa = xpc_get_rsvd_page_pa(nasid);
+	if (*remote_rp_pa == 0)
+		return xpcNoRsvdPageAddr;
+
+	/* pull over the reserved page header and part_nasids mask */
+	bres = xp_bte_copy(*remote_rp_pa, (u64)remote_rp,
+			   XPC_RP_HEADER_SIZE + xp_nasid_mask_bytes,
+			   (BTE_NOTIFY | BTE_WACQUIRE), NULL);
+	if (bres != BTE_SUCCESS)
+		return xpc_map_bte_errors(bres);
+
+	if (discovered_nasids != NULL) {
+		u64 *remote_part_nasids = XPC_RP_PART_NASIDS(remote_rp);
+
+		for (i = 0; i < xp_nasid_mask_words; i++)
+			discovered_nasids[i] |= remote_part_nasids[i];
+	}
+
+	/* check that the partid is for another partition */
+
+	if (remote_rp->partid < 1 ||
+	    remote_rp->partid > (XP_MAX_PARTITIONS - 1)) {
+		return xpcInvalidPartid;
+	}
+
+	if (remote_rp->partid == sn_partition_id)
+		return xpcLocalPartid;
+
+	if (XPC_VERSION_MAJOR(remote_rp->version) !=
+	    XPC_VERSION_MAJOR(XPC_RP_VERSION)) {
+		return xpcBadVersion;
+	}
+
+	return xpcSuccess;
+}
+
+/*
+ * Get a copy of the remote partition's XPC variables from the reserved page.
+ *
+ * remote_vars points to a buffer that is cacheline aligned for BTE copies and
+ * assumed to be of size XPC_RP_VARS_SIZE.
+ */
+static enum xpc_retval
+xpc_get_remote_vars(u64 remote_vars_pa, struct xpc_vars *remote_vars)
+{
+	int bres;
+
+	if (remote_vars_pa == 0)
+		return xpcVarsNotSet;
+
+	/* pull over the cross partition variables */
+	bres = xp_bte_copy(remote_vars_pa, (u64)remote_vars, XPC_RP_VARS_SIZE,
+			   (BTE_NOTIFY | BTE_WACQUIRE), NULL);
+	if (bres != BTE_SUCCESS)
+		return xpc_map_bte_errors(bres);
+
+	if (XPC_VERSION_MAJOR(remote_vars->version) !=
+	    XPC_VERSION_MAJOR(XPC_V_VERSION)) {
+		return xpcBadVersion;
+	}
+
+	return xpcSuccess;
+}
+
+/*
+ * Update the remote partition's info.
+ */
+static void
+xpc_update_partition_info(struct xpc_partition *part, u8 remote_rp_version,
+			  struct timespec *remote_rp_stamp, u64 remote_rp_pa,
+			  u64 remote_vars_pa, struct xpc_vars *remote_vars)
+{
+	part->remote_rp_version = remote_rp_version;
+	dev_dbg(xpc_part, "  remote_rp_version = 0x%016x\n",
+		part->remote_rp_version);
+
+	part->remote_rp_stamp = *remote_rp_stamp;
+	dev_dbg(xpc_part, "  remote_rp_stamp (tv_sec = 0x%lx tv_nsec = 0x%lx\n",
+		part->remote_rp_stamp.tv_sec, part->remote_rp_stamp.tv_nsec);
+
+	part->remote_rp_pa = remote_rp_pa;
+	dev_dbg(xpc_part, "  remote_rp_pa = 0x%016lx\n", part->remote_rp_pa);
+
+	part->remote_vars_pa = remote_vars_pa;
+	dev_dbg(xpc_part, "  remote_vars_pa = 0x%016lx\n",
+		part->remote_vars_pa);
+
+	part->last_heartbeat = remote_vars->heartbeat;
+	dev_dbg(xpc_part, "  last_heartbeat = 0x%016lx\n",
+		part->last_heartbeat);
+
+	part->remote_vars_part_pa = remote_vars->vars_part_pa;
+	dev_dbg(xpc_part, "  remote_vars_part_pa = 0x%016lx\n",
+		part->remote_vars_part_pa);
+
+	part->remote_act_nasid = remote_vars->act_nasid;
+	dev_dbg(xpc_part, "  remote_act_nasid = 0x%x\n",
+		part->remote_act_nasid);
+
+	part->remote_act_phys_cpuid = remote_vars->act_phys_cpuid;
+	dev_dbg(xpc_part, "  remote_act_phys_cpuid = 0x%x\n",
+		part->remote_act_phys_cpuid);
+
+	part->remote_amos_page_pa = remote_vars->amos_page_pa;
+	dev_dbg(xpc_part, "  remote_amos_page_pa = 0x%lx\n",
+		part->remote_amos_page_pa);
+
+	part->remote_vars_version = remote_vars->version;
+	dev_dbg(xpc_part, "  remote_vars_version = 0x%x\n",
+		part->remote_vars_version);
+}
+
+/*
+ * Prior code has determined the nasid which generated an IPI.  Inspect
+ * that nasid to determine if its partition needs to be activated or
+ * deactivated.
+ *
+ * A partition is consider "awaiting activation" if our partition
+ * flags indicate it is not active and it has a heartbeat.  A
+ * partition is considered "awaiting deactivation" if our partition
+ * flags indicate it is active but it has no heartbeat or it is not
+ * sending its heartbeat to us.
+ *
+ * To determine the heartbeat, the remote nasid must have a properly
+ * initialized reserved page.
+ */
+static void
+xpc_identify_act_IRQ_req(int nasid)
+{
+	struct xpc_rsvd_page *remote_rp;
+	struct xpc_vars *remote_vars;
+	u64 remote_rp_pa;
+	u64 remote_vars_pa;
+	int remote_rp_version;
+	int reactivate = 0;
+	int stamp_diff;
+	struct timespec remote_rp_stamp = { 0, 0 };
+	partid_t partid;
+	struct xpc_partition *part;
+	enum xpc_retval ret;
+
+	/* pull over the reserved page structure */
+
+	remote_rp = (struct xpc_rsvd_page *)xpc_remote_copy_buffer;
+
+	ret = xpc_get_remote_rp(nasid, NULL, remote_rp, &remote_rp_pa);
+	if (ret != xpcSuccess) {
+		dev_warn(xpc_part, "unable to get reserved page from nasid %d, "
+			 "which sent interrupt, reason=%d\n", nasid, ret);
+		return;
+	}
+
+	remote_vars_pa = remote_rp->vars_pa;
+	remote_rp_version = remote_rp->version;
+	if (XPC_SUPPORTS_RP_STAMP(remote_rp_version))
+		remote_rp_stamp = remote_rp->stamp;
+
+	partid = remote_rp->partid;
+	part = &xpc_partitions[partid];
+
+	/* pull over the cross partition variables */
+
+	remote_vars = (struct xpc_vars *)xpc_remote_copy_buffer;
+
+	ret = xpc_get_remote_vars(remote_vars_pa, remote_vars);
+	if (ret != xpcSuccess) {
+
+		dev_warn(xpc_part, "unable to get XPC variables from nasid %d, "
+			 "which sent interrupt, reason=%d\n", nasid, ret);
+
+		XPC_DEACTIVATE_PARTITION(part, ret);
+		return;
+	}
+
+	part->act_IRQ_rcvd++;
+
+	dev_dbg(xpc_part, "partid for nasid %d is %d; IRQs = %d; HB = "
+		"%ld:0x%lx\n", (int)nasid, (int)partid, part->act_IRQ_rcvd,
+		remote_vars->heartbeat, remote_vars->heartbeating_to_mask);
+
+	if (xpc_partition_disengaged(part) &&
+	    part->act_state == XPC_P_INACTIVE) {
+
+		xpc_update_partition_info(part, remote_rp_version,
+					  &remote_rp_stamp, remote_rp_pa,
+					  remote_vars_pa, remote_vars);
+
+		if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) {
+			if (xpc_partition_disengage_requested(1UL << partid)) {
+				/*
+				 * Other side is waiting on us to disengage,
+				 * even though we already have.
+				 */
+				return;
+			}
+		} else {
+			/* other side doesn't support disengage requests */
+			xpc_clear_partition_disengage_request(1UL << partid);
+		}
+
+		xpc_activate_partition(part);
+		return;
+	}
+
+	DBUG_ON(part->remote_rp_version == 0);
+	DBUG_ON(part->remote_vars_version == 0);
+
+	if (!XPC_SUPPORTS_RP_STAMP(part->remote_rp_version)) {
+		DBUG_ON(XPC_SUPPORTS_DISENGAGE_REQUEST(part->
+						       remote_vars_version));
+
+		if (!XPC_SUPPORTS_RP_STAMP(remote_rp_version)) {
+			DBUG_ON(XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->
+							       version));
+			/* see if the other side rebooted */
+			if (part->remote_amos_page_pa ==
+			    remote_vars->amos_page_pa &&
+			    xpc_hb_allowed(sn_partition_id, remote_vars)) {
+				/* doesn't look that way, so ignore the IPI */
+				return;
+			}
+		}
+
+		/*
+		 * Other side rebooted and previous XPC didn't support the
+		 * disengage request, so we don't need to do anything special.
+		 */
+
+		xpc_update_partition_info(part, remote_rp_version,
+					  &remote_rp_stamp, remote_rp_pa,
+					  remote_vars_pa, remote_vars);
+		part->reactivate_nasid = nasid;
+		XPC_DEACTIVATE_PARTITION(part, xpcReactivating);
+		return;
+	}
+
+	DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version));
+
+	if (!XPC_SUPPORTS_RP_STAMP(remote_rp_version)) {
+		DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->version));
+
+		/*
+		 * Other side rebooted and previous XPC did support the
+		 * disengage request, but the new one doesn't.
+		 */
+
+		xpc_clear_partition_engaged(1UL << partid);
+		xpc_clear_partition_disengage_request(1UL << partid);
+
+		xpc_update_partition_info(part, remote_rp_version,
+					  &remote_rp_stamp, remote_rp_pa,
+					  remote_vars_pa, remote_vars);
+		reactivate = 1;
+
+	} else {
+		DBUG_ON(!XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->version));
+
+		stamp_diff = xpc_compare_stamps(&part->remote_rp_stamp,
+						&remote_rp_stamp);
+		if (stamp_diff != 0) {
+			DBUG_ON(stamp_diff >= 0);
+
+			/*
+			 * Other side rebooted and the previous XPC did support
+			 * the disengage request, as does the new one.
+			 */
+
+			DBUG_ON(xpc_partition_engaged(1UL << partid));
+			DBUG_ON(xpc_partition_disengage_requested(1UL <<
+								  partid));
+
+			xpc_update_partition_info(part, remote_rp_version,
+						  &remote_rp_stamp,
+						  remote_rp_pa, remote_vars_pa,
+						  remote_vars);
+			reactivate = 1;
+		}
+	}
+
+	if (part->disengage_request_timeout > 0 &&
+	    !xpc_partition_disengaged(part)) {
+		/* still waiting on other side to disengage from us */
+		return;
+	}
+
+	if (reactivate) {
+		part->reactivate_nasid = nasid;
+		XPC_DEACTIVATE_PARTITION(part, xpcReactivating);
+
+	} else if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version) &&
+		   xpc_partition_disengage_requested(1UL << partid)) {
+		XPC_DEACTIVATE_PARTITION(part, xpcOtherGoingDown);
+	}
+}
+
+/*
+ * Loop through the activation AMO variables and process any bits
+ * which are set.  Each bit indicates a nasid sending a partition
+ * activation or deactivation request.
+ *
+ * Return #of IRQs detected.
+ */
+int
+xpc_identify_act_IRQ_sender(void)
+{
+	int word, bit;
+	u64 nasid_mask;
+	u64 nasid;		/* remote nasid */
+	int n_IRQs_detected = 0;
+	AMO_t *act_amos;
+
+	act_amos = xpc_vars->amos_page + XPC_ACTIVATE_IRQ_AMOS;
+
+	/* scan through act AMO variable looking for non-zero entries */
+	for (word = 0; word < xp_nasid_mask_words; word++) {
+
+		if (xpc_exiting)
+			break;
+
+		nasid_mask = xpc_IPI_receive(&act_amos[word]);
+		if (nasid_mask == 0) {
+			/* no IRQs from nasids in this variable */
+			continue;
+		}
+
+		dev_dbg(xpc_part, "AMO[%d] gave back 0x%lx\n", word,
+			nasid_mask);
+
+		/*
+		 * If this nasid has been added to the machine since
+		 * our partition was reset, this will retain the
+		 * remote nasid in our reserved pages machine mask.
+		 * This is used in the event of module reload.
+		 */
+		xpc_mach_nasids[word] |= nasid_mask;
+
+		/* locate the nasid(s) which sent interrupts */
+
+		for (bit = 0; bit < (8 * sizeof(u64)); bit++) {
+			if (nasid_mask & (1UL << bit)) {
+				n_IRQs_detected++;
+				nasid = XPC_NASID_FROM_W_B(word, bit);
+				dev_dbg(xpc_part, "interrupt from nasid %ld\n",
+					nasid);
+				xpc_identify_act_IRQ_req(nasid);
+			}
+		}
+	}
+	return n_IRQs_detected;
+}
+
+/*
+ * See if the other side has responded to a partition disengage request
+ * from us.
+ */
+int
+xpc_partition_disengaged(struct xpc_partition *part)
+{
+	partid_t partid = XPC_PARTID(part);
+	int disengaged;
+
+	disengaged = (xpc_partition_engaged(1UL << partid) == 0);
+	if (part->disengage_request_timeout) {
+		if (!disengaged) {
+			if (time_before(jiffies,
+			    part->disengage_request_timeout)) {
+				/* timelimit hasn't been reached yet */
+				return 0;
+			}
+
+			/*
+			 * Other side hasn't responded to our disengage
+			 * request in a timely fashion, so assume it's dead.
+			 */
+
+			dev_info(xpc_part, "disengage from remote partition %d "
+				 "timed out\n", partid);
+			xpc_disengage_request_timedout = 1;
+			xpc_clear_partition_engaged(1UL << partid);
+			disengaged = 1;
+		}
+		part->disengage_request_timeout = 0;
+
+		/* cancel the timer function, provided it's not us */
+		if (!in_interrupt()) {
+			del_singleshot_timer_sync(&part->
+						  disengage_request_timer);
+		}
+
+		DBUG_ON(part->act_state != XPC_P_DEACTIVATING &&
+			part->act_state != XPC_P_INACTIVE);
+		if (part->act_state != XPC_P_INACTIVE)
+			xpc_wakeup_channel_mgr(part);
+
+		if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version))
+			xpc_cancel_partition_disengage_request(part);
+	}
+	return disengaged;
+}
+
+/*
+ * Mark specified partition as active.
+ */
+enum xpc_retval
+xpc_mark_partition_active(struct xpc_partition *part)
+{
+	unsigned long irq_flags;
+	enum xpc_retval ret;
+
+	dev_dbg(xpc_part, "setting partition %d to ACTIVE\n", XPC_PARTID(part));
+
+	spin_lock_irqsave(&part->act_lock, irq_flags);
+	if (part->act_state == XPC_P_ACTIVATING) {
+		part->act_state = XPC_P_ACTIVE;
+		ret = xpcSuccess;
+	} else {
+		DBUG_ON(part->reason == xpcSuccess);
+		ret = part->reason;
+	}
+	spin_unlock_irqrestore(&part->act_lock, irq_flags);
+
+	return ret;
+}
+
+/*
+ * Notify XPC that the partition is down.
+ */
+void
+xpc_deactivate_partition(const int line, struct xpc_partition *part,
+			 enum xpc_retval reason)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&part->act_lock, irq_flags);
+
+	if (part->act_state == XPC_P_INACTIVE) {
+		XPC_SET_REASON(part, reason, line);
+		spin_unlock_irqrestore(&part->act_lock, irq_flags);
+		if (reason == xpcReactivating) {
+			/* we interrupt ourselves to reactivate partition */
+			xpc_IPI_send_reactivate(part);
+		}
+		return;
+	}
+	if (part->act_state == XPC_P_DEACTIVATING) {
+		if ((part->reason == xpcUnloading && reason != xpcUnloading) ||
+		    reason == xpcReactivating) {
+			XPC_SET_REASON(part, reason, line);
+		}
+		spin_unlock_irqrestore(&part->act_lock, irq_flags);
+		return;
+	}
+
+	part->act_state = XPC_P_DEACTIVATING;
+	XPC_SET_REASON(part, reason, line);
+
+	spin_unlock_irqrestore(&part->act_lock, irq_flags);
+
+	if (XPC_SUPPORTS_DISENGAGE_REQUEST(part->remote_vars_version)) {
+		xpc_request_partition_disengage(part);
+		xpc_IPI_send_disengage(part);
+
+		/* set a timelimit on the disengage request */
+		part->disengage_request_timeout = jiffies +
+		    (xpc_disengage_request_timelimit * HZ);
+		part->disengage_request_timer.expires =
+		    part->disengage_request_timeout;
+		add_timer(&part->disengage_request_timer);
+	}
+
+	dev_dbg(xpc_part, "bringing partition %d down, reason = %d\n",
+		XPC_PARTID(part), reason);
+
+	xpc_partition_going_down(part, reason);
+}
+
+/*
+ * Mark specified partition as inactive.
+ */
+void
+xpc_mark_partition_inactive(struct xpc_partition *part)
+{
+	unsigned long irq_flags;
+
+	dev_dbg(xpc_part, "setting partition %d to INACTIVE\n",
+		XPC_PARTID(part));
+
+	spin_lock_irqsave(&part->act_lock, irq_flags);
+	part->act_state = XPC_P_INACTIVE;
+	spin_unlock_irqrestore(&part->act_lock, irq_flags);
+	part->remote_rp_pa = 0;
+}
+
+/*
+ * SAL has provided a partition and machine mask.  The partition mask
+ * contains a bit for each even nasid in our partition.  The machine
+ * mask contains a bit for each even nasid in the entire machine.
+ *
+ * Using those two bit arrays, we can determine which nasids are
+ * known in the machine.  Each should also have a reserved page
+ * initialized if they are available for partitioning.
+ */
+void
+xpc_discovery(void)
+{
+	void *remote_rp_base;
+	struct xpc_rsvd_page *remote_rp;
+	struct xpc_vars *remote_vars;
+	u64 remote_rp_pa;
+	u64 remote_vars_pa;
+	int region;
+	int region_size;
+	int max_regions;
+	int nasid;
+	struct xpc_rsvd_page *rp;
+	partid_t partid;
+	struct xpc_partition *part;
+	u64 *discovered_nasids;
+	enum xpc_retval ret;
+
+	remote_rp = xpc_kmalloc_cacheline_aligned(XPC_RP_HEADER_SIZE +
+						  xp_nasid_mask_bytes,
+						  GFP_KERNEL, &remote_rp_base);
+	if (remote_rp == NULL)
+		return;
+
+	remote_vars = (struct xpc_vars *)remote_rp;
+
+	discovered_nasids = kzalloc(sizeof(u64) * xp_nasid_mask_words,
+				    GFP_KERNEL);
+	if (discovered_nasids == NULL) {
+		kfree(remote_rp_base);
+		return;
+	}
+
+	rp = (struct xpc_rsvd_page *)xpc_rsvd_page;
+
+	/*
+	 * The term 'region' in this context refers to the minimum number of
+	 * nodes that can comprise an access protection grouping. The access
+	 * protection is in regards to memory, IOI and IPI.
+	 */
+	max_regions = 64;
+	region_size = sn_region_size;
+
+	switch (region_size) {
+	case 128:
+		max_regions *= 2;
+	case 64:
+		max_regions *= 2;
+	case 32:
+		max_regions *= 2;
+		region_size = 16;
+		DBUG_ON(!is_shub2());
+	}
+
+	for (region = 0; region < max_regions; region++) {
+
+		if (xpc_exiting)
+			break;
+
+		dev_dbg(xpc_part, "searching region %d\n", region);
+
+		for (nasid = (region * region_size * 2);
+		     nasid < ((region + 1) * region_size * 2); nasid += 2) {
+
+			if (xpc_exiting)
+				break;
+
+			dev_dbg(xpc_part, "checking nasid %d\n", nasid);
+
+			if (XPC_NASID_IN_ARRAY(nasid, xpc_part_nasids)) {
+				dev_dbg(xpc_part, "PROM indicates Nasid %d is "
+					"part of the local partition; skipping "
+					"region\n", nasid);
+				break;
+			}
+
+			if (!(XPC_NASID_IN_ARRAY(nasid, xpc_mach_nasids))) {
+				dev_dbg(xpc_part, "PROM indicates Nasid %d was "
+					"not on Numa-Link network at reset\n",
+					nasid);
+				continue;
+			}
+
+			if (XPC_NASID_IN_ARRAY(nasid, discovered_nasids)) {
+				dev_dbg(xpc_part, "Nasid %d is part of a "
+					"partition which was previously "
+					"discovered\n", nasid);
+				continue;
+			}
+
+			/* pull over the reserved page structure */
+
+			ret = xpc_get_remote_rp(nasid, discovered_nasids,
+						remote_rp, &remote_rp_pa);
+			if (ret != xpcSuccess) {
+				dev_dbg(xpc_part, "unable to get reserved page "
+					"from nasid %d, reason=%d\n", nasid,
+					ret);
+
+				if (ret == xpcLocalPartid)
+					break;
+
+				continue;
+			}
+
+			remote_vars_pa = remote_rp->vars_pa;
+
+			partid = remote_rp->partid;
+			part = &xpc_partitions[partid];
+
+			/* pull over the cross partition variables */
+
+			ret = xpc_get_remote_vars(remote_vars_pa, remote_vars);
+			if (ret != xpcSuccess) {
+				dev_dbg(xpc_part, "unable to get XPC variables "
+					"from nasid %d, reason=%d\n", nasid,
+					ret);
+
+				XPC_DEACTIVATE_PARTITION(part, ret);
+				continue;
+			}
+
+			if (part->act_state != XPC_P_INACTIVE) {
+				dev_dbg(xpc_part, "partition %d on nasid %d is "
+					"already activating\n", partid, nasid);
+				break;
+			}
+
+			/*
+			 * Register the remote partition's AMOs with SAL so it
+			 * can handle and cleanup errors within that address
+			 * range should the remote partition go down. We don't
+			 * unregister this range because it is difficult to
+			 * tell when outstanding writes to the remote partition
+			 * are finished and thus when it is thus safe to
+			 * unregister. This should not result in wasted space
+			 * in the SAL xp_addr_region table because we should
+			 * get the same page for remote_act_amos_pa after
+			 * module reloads and system reboots.
+			 */
+			if (sn_register_xp_addr_region
+			    (remote_vars->amos_page_pa, PAGE_SIZE, 1) < 0) {
+				dev_dbg(xpc_part,
+					"partition %d failed to "
+					"register xp_addr region 0x%016lx\n",
+					partid, remote_vars->amos_page_pa);
+
+				XPC_SET_REASON(part, xpcPhysAddrRegFailed,
+					       __LINE__);
+				break;
+			}
+
+			/*
+			 * The remote nasid is valid and available.
+			 * Send an interrupt to that nasid to notify
+			 * it that we are ready to begin activation.
+			 */
+			dev_dbg(xpc_part, "sending an interrupt to AMO 0x%lx, "
+				"nasid %d, phys_cpuid 0x%x\n",
+				remote_vars->amos_page_pa,
+				remote_vars->act_nasid,
+				remote_vars->act_phys_cpuid);
+
+			if (XPC_SUPPORTS_DISENGAGE_REQUEST(remote_vars->
+							   version)) {
+				part->remote_amos_page_pa =
+				    remote_vars->amos_page_pa;
+				xpc_mark_partition_disengaged(part);
+				xpc_cancel_partition_disengage_request(part);
+			}
+			xpc_IPI_send_activate(remote_vars);
+		}
+	}
+
+	kfree(discovered_nasids);
+	kfree(remote_rp_base);
+}
+
+/*
+ * Given a partid, get the nasids owned by that partition from the
+ * remote partition's reserved page.
+ */
+enum xpc_retval
+xpc_initiate_partid_to_nasids(partid_t partid, void *nasid_mask)
+{
+	struct xpc_partition *part;
+	u64 part_nasid_pa;
+	int bte_res;
+
+	part = &xpc_partitions[partid];
+	if (part->remote_rp_pa == 0)
+		return xpcPartitionDown;
+
+	memset(nasid_mask, 0, XP_NASID_MASK_BYTES);
+
+	part_nasid_pa = (u64)XPC_RP_PART_NASIDS(part->remote_rp_pa);
+
+	bte_res = xp_bte_copy(part_nasid_pa, (u64)nasid_mask,
+			      xp_nasid_mask_bytes, (BTE_NOTIFY | BTE_WACQUIRE),
+			      NULL);
+
+	return xpc_map_bte_errors(bte_res);
+}
diff --git a/drivers/misc/sgi-xp/xpnet.c b/drivers/misc/sgi-xp/xpnet.c
new file mode 100644
index 00000000000..a9543c65814
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpnet.c
@@ -0,0 +1,677 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1999-2008 Silicon Graphics, Inc. All rights reserved.
+ */
+
+/*
+ * Cross Partition Network Interface (XPNET) support
+ *
+ *	XPNET provides a virtual network layered on top of the Cross
+ *	Partition communication layer.
+ *
+ *	XPNET provides direct point-to-point and broadcast-like support
+ *	for an ethernet-like device.  The ethernet broadcast medium is
+ *	replaced with a point-to-point message structure which passes
+ *	pointers to a DMA-capable block that a remote partition should
+ *	retrieve and pass to the upper level networking layer.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/delay.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/smp.h>
+#include <linux/string.h>
+#include <asm/sn/bte.h>
+#include <asm/sn/io.h>
+#include <asm/sn/sn_sal.h>
+#include <asm/atomic.h>
+#include "xp.h"
+
+/*
+ * The message payload transferred by XPC.
+ *
+ * buf_pa is the physical address where the DMA should pull from.
+ *
+ * NOTE: for performance reasons, buf_pa should _ALWAYS_ begin on a
+ * cacheline boundary.  To accomplish this, we record the number of
+ * bytes from the beginning of the first cacheline to the first useful
+ * byte of the skb (leadin_ignore) and the number of bytes from the
+ * last useful byte of the skb to the end of the last cacheline
+ * (tailout_ignore).
+ *
+ * size is the number of bytes to transfer which includes the skb->len
+ * (useful bytes of the senders skb) plus the leadin and tailout
+ */
+struct xpnet_message {
+	u16 version;		/* Version for this message */
+	u16 embedded_bytes;	/* #of bytes embedded in XPC message */
+	u32 magic;		/* Special number indicating this is xpnet */
+	u64 buf_pa;		/* phys address of buffer to retrieve */
+	u32 size;		/* #of bytes in buffer */
+	u8 leadin_ignore;	/* #of bytes to ignore at the beginning */
+	u8 tailout_ignore;	/* #of bytes to ignore at the end */
+	unsigned char data;	/* body of small packets */
+};
+
+/*
+ * Determine the size of our message, the cacheline aligned size,
+ * and then the number of message will request from XPC.
+ *
+ * XPC expects each message to exist in an individual cacheline.
+ */
+#define XPNET_MSG_SIZE		(L1_CACHE_BYTES - XPC_MSG_PAYLOAD_OFFSET)
+#define XPNET_MSG_DATA_MAX	\
+		(XPNET_MSG_SIZE - (u64)(&((struct xpnet_message *)0)->data))
+#define XPNET_MSG_ALIGNED_SIZE	(L1_CACHE_ALIGN(XPNET_MSG_SIZE))
+#define XPNET_MSG_NENTRIES	(PAGE_SIZE / XPNET_MSG_ALIGNED_SIZE)
+
+#define XPNET_MAX_KTHREADS	(XPNET_MSG_NENTRIES + 1)
+#define XPNET_MAX_IDLE_KTHREADS	(XPNET_MSG_NENTRIES + 1)
+
+/*
+ * Version number of XPNET implementation. XPNET can always talk to versions
+ * with same major #, and never talk to versions with a different version.
+ */
+#define _XPNET_VERSION(_major, _minor)	(((_major) << 4) | (_minor))
+#define XPNET_VERSION_MAJOR(_v)		((_v) >> 4)
+#define XPNET_VERSION_MINOR(_v)		((_v) & 0xf)
+
+#define	XPNET_VERSION _XPNET_VERSION(1, 0)	/* version 1.0 */
+#define	XPNET_VERSION_EMBED _XPNET_VERSION(1, 1)	/* version 1.1 */
+#define XPNET_MAGIC	0x88786984	/* "XNET" */
+
+#define XPNET_VALID_MSG(_m)						     \
+   ((XPNET_VERSION_MAJOR(_m->version) == XPNET_VERSION_MAJOR(XPNET_VERSION)) \
+    && (msg->magic == XPNET_MAGIC))
+
+#define XPNET_DEVICE_NAME		"xp0"
+
+/*
+ * When messages are queued with xpc_send_notify, a kmalloc'd buffer
+ * of the following type is passed as a notification cookie.  When the
+ * notification function is called, we use the cookie to decide
+ * whether all outstanding message sends have completed.  The skb can
+ * then be released.
+ */
+struct xpnet_pending_msg {
+	struct list_head free_list;
+	struct sk_buff *skb;
+	atomic_t use_count;
+};
+
+/* driver specific structure pointed to by the device structure */
+struct xpnet_dev_private {
+	struct net_device_stats stats;
+};
+
+struct net_device *xpnet_device;
+
+/*
+ * When we are notified of other partitions activating, we add them to
+ * our bitmask of partitions to which we broadcast.
+ */
+static u64 xpnet_broadcast_partitions;
+/* protect above */
+static DEFINE_SPINLOCK(xpnet_broadcast_lock);
+
+/*
+ * Since the Block Transfer Engine (BTE) is being used for the transfer
+ * and it relies upon cache-line size transfers, we need to reserve at
+ * least one cache-line for head and tail alignment.  The BTE is
+ * limited to 8MB transfers.
+ *
+ * Testing has shown that changing MTU to greater than 64KB has no effect
+ * on TCP as the two sides negotiate a Max Segment Size that is limited
+ * to 64K.  Other protocols May use packets greater than this, but for
+ * now, the default is 64KB.
+ */
+#define XPNET_MAX_MTU (0x800000UL - L1_CACHE_BYTES)
+/* 32KB has been determined to be the ideal */
+#define XPNET_DEF_MTU (0x8000UL)
+
+/*
+ * The partition id is encapsulated in the MAC address.  The following
+ * define locates the octet the partid is in.
+ */
+#define XPNET_PARTID_OCTET	1
+#define XPNET_LICENSE_OCTET	2
+
+/*
+ * Define the XPNET debug device structure that is to be used with dev_dbg(),
+ * dev_err(), dev_warn(), and dev_info().
+ */
+struct device_driver xpnet_dbg_name = {
+	.name = "xpnet"
+};
+
+struct device xpnet_dbg_subname = {
+	.bus_id = {0},		/* set to "" */
+	.driver = &xpnet_dbg_name
+};
+
+struct device *xpnet = &xpnet_dbg_subname;
+
+/*
+ * Packet was recevied by XPC and forwarded to us.
+ */
+static void
+xpnet_receive(partid_t partid, int channel, struct xpnet_message *msg)
+{
+	struct sk_buff *skb;
+	bte_result_t bret;
+	struct xpnet_dev_private *priv =
+	    (struct xpnet_dev_private *)xpnet_device->priv;
+
+	if (!XPNET_VALID_MSG(msg)) {
+		/*
+		 * Packet with a different XPC version.  Ignore.
+		 */
+		xpc_received(partid, channel, (void *)msg);
+
+		priv->stats.rx_errors++;
+
+		return;
+	}
+	dev_dbg(xpnet, "received 0x%lx, %d, %d, %d\n", msg->buf_pa, msg->size,
+		msg->leadin_ignore, msg->tailout_ignore);
+
+	/* reserve an extra cache line */
+	skb = dev_alloc_skb(msg->size + L1_CACHE_BYTES);
+	if (!skb) {
+		dev_err(xpnet, "failed on dev_alloc_skb(%d)\n",
+			msg->size + L1_CACHE_BYTES);
+
+		xpc_received(partid, channel, (void *)msg);
+
+		priv->stats.rx_errors++;
+
+		return;
+	}
+
+	/*
+	 * The allocated skb has some reserved space.
+	 * In order to use bte_copy, we need to get the
+	 * skb->data pointer moved forward.
+	 */
+	skb_reserve(skb, (L1_CACHE_BYTES - ((u64)skb->data &
+					    (L1_CACHE_BYTES - 1)) +
+			  msg->leadin_ignore));
+
+	/*
+	 * Update the tail pointer to indicate data actually
+	 * transferred.
+	 */
+	skb_put(skb, (msg->size - msg->leadin_ignore - msg->tailout_ignore));
+
+	/*
+	 * Move the data over from the other side.
+	 */
+	if ((XPNET_VERSION_MINOR(msg->version) == 1) &&
+	    (msg->embedded_bytes != 0)) {
+		dev_dbg(xpnet, "copying embedded message. memcpy(0x%p, 0x%p, "
+			"%lu)\n", skb->data, &msg->data,
+			(size_t)msg->embedded_bytes);
+
+		skb_copy_to_linear_data(skb, &msg->data,
+					(size_t)msg->embedded_bytes);
+	} else {
+		dev_dbg(xpnet, "transferring buffer to the skb->data area;\n\t"
+			"bte_copy(0x%p, 0x%p, %hu)\n", (void *)msg->buf_pa,
+			(void *)__pa((u64)skb->data & ~(L1_CACHE_BYTES - 1)),
+			msg->size);
+
+		bret = bte_copy(msg->buf_pa,
+				__pa((u64)skb->data & ~(L1_CACHE_BYTES - 1)),
+				msg->size, (BTE_NOTIFY | BTE_WACQUIRE), NULL);
+
+		if (bret != BTE_SUCCESS) {
+			/*
+			 * >>> Need better way of cleaning skb.  Currently skb
+			 * >>> appears in_use and we can't just call
+			 * >>> dev_kfree_skb.
+			 */
+			dev_err(xpnet, "bte_copy(0x%p, 0x%p, 0x%hx) returned "
+				"error=0x%x\n", (void *)msg->buf_pa,
+				(void *)__pa((u64)skb->data &
+					     ~(L1_CACHE_BYTES - 1)),
+				msg->size, bret);
+
+			xpc_received(partid, channel, (void *)msg);
+
+			priv->stats.rx_errors++;
+
+			return;
+		}
+	}
+
+	dev_dbg(xpnet, "<skb->head=0x%p skb->data=0x%p skb->tail=0x%p "
+		"skb->end=0x%p skb->len=%d\n", (void *)skb->head,
+		(void *)skb->data, skb_tail_pointer(skb), skb_end_pointer(skb),
+		skb->len);
+
+	skb->protocol = eth_type_trans(skb, xpnet_device);
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	dev_dbg(xpnet, "passing skb to network layer\n"
+		KERN_DEBUG "\tskb->head=0x%p skb->data=0x%p skb->tail=0x%p "
+		"skb->end=0x%p skb->len=%d\n",
+		(void *)skb->head, (void *)skb->data, skb_tail_pointer(skb),
+		skb_end_pointer(skb), skb->len);
+
+	xpnet_device->last_rx = jiffies;
+	priv->stats.rx_packets++;
+	priv->stats.rx_bytes += skb->len + ETH_HLEN;
+
+	netif_rx_ni(skb);
+	xpc_received(partid, channel, (void *)msg);
+}
+
+/*
+ * This is the handler which XPC calls during any sort of change in
+ * state or message reception on a connection.
+ */
+static void
+xpnet_connection_activity(enum xpc_retval reason, partid_t partid, int channel,
+			  void *data, void *key)
+{
+	long bp;
+
+	DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
+	DBUG_ON(channel != XPC_NET_CHANNEL);
+
+	switch (reason) {
+	case xpcMsgReceived:	/* message received */
+		DBUG_ON(data == NULL);
+
+		xpnet_receive(partid, channel, (struct xpnet_message *)data);
+		break;
+
+	case xpcConnected:	/* connection completed to a partition */
+		spin_lock_bh(&xpnet_broadcast_lock);
+		xpnet_broadcast_partitions |= 1UL << (partid - 1);
+		bp = xpnet_broadcast_partitions;
+		spin_unlock_bh(&xpnet_broadcast_lock);
+
+		netif_carrier_on(xpnet_device);
+
+		dev_dbg(xpnet, "%s connection created to partition %d; "
+			"xpnet_broadcast_partitions=0x%lx\n",
+			xpnet_device->name, partid, bp);
+		break;
+
+	default:
+		spin_lock_bh(&xpnet_broadcast_lock);
+		xpnet_broadcast_partitions &= ~(1UL << (partid - 1));
+		bp = xpnet_broadcast_partitions;
+		spin_unlock_bh(&xpnet_broadcast_lock);
+
+		if (bp == 0)
+			netif_carrier_off(xpnet_device);
+
+		dev_dbg(xpnet, "%s disconnected from partition %d; "
+			"xpnet_broadcast_partitions=0x%lx\n",
+			xpnet_device->name, partid, bp);
+		break;
+
+	}
+}
+
+static int
+xpnet_dev_open(struct net_device *dev)
+{
+	enum xpc_retval ret;
+
+	dev_dbg(xpnet, "calling xpc_connect(%d, 0x%p, NULL, %ld, %ld, %ld, "
+		"%ld)\n", XPC_NET_CHANNEL, xpnet_connection_activity,
+		XPNET_MSG_SIZE, XPNET_MSG_NENTRIES, XPNET_MAX_KTHREADS,
+		XPNET_MAX_IDLE_KTHREADS);
+
+	ret = xpc_connect(XPC_NET_CHANNEL, xpnet_connection_activity, NULL,
+			  XPNET_MSG_SIZE, XPNET_MSG_NENTRIES,
+			  XPNET_MAX_KTHREADS, XPNET_MAX_IDLE_KTHREADS);
+	if (ret != xpcSuccess) {
+		dev_err(xpnet, "ifconfig up of %s failed on XPC connect, "
+			"ret=%d\n", dev->name, ret);
+
+		return -ENOMEM;
+	}
+
+	dev_dbg(xpnet, "ifconfig up of %s; XPC connected\n", dev->name);
+
+	return 0;
+}
+
+static int
+xpnet_dev_stop(struct net_device *dev)
+{
+	xpc_disconnect(XPC_NET_CHANNEL);
+
+	dev_dbg(xpnet, "ifconfig down of %s; XPC disconnected\n", dev->name);
+
+	return 0;
+}
+
+static int
+xpnet_dev_change_mtu(struct net_device *dev, int new_mtu)
+{
+	/* 68 comes from min TCP+IP+MAC header */
+	if ((new_mtu < 68) || (new_mtu > XPNET_MAX_MTU)) {
+		dev_err(xpnet, "ifconfig %s mtu %d failed; value must be "
+			"between 68 and %ld\n", dev->name, new_mtu,
+			XPNET_MAX_MTU);
+		return -EINVAL;
+	}
+
+	dev->mtu = new_mtu;
+	dev_dbg(xpnet, "ifconfig %s mtu set to %d\n", dev->name, new_mtu);
+	return 0;
+}
+
+/*
+ * Required for the net_device structure.
+ */
+static int
+xpnet_dev_set_config(struct net_device *dev, struct ifmap *new_map)
+{
+	return 0;
+}
+
+/*
+ * Return statistics to the caller.
+ */
+static struct net_device_stats *
+xpnet_dev_get_stats(struct net_device *dev)
+{
+	struct xpnet_dev_private *priv;
+
+	priv = (struct xpnet_dev_private *)dev->priv;
+
+	return &priv->stats;
+}
+
+/*
+ * Notification that the other end has received the message and
+ * DMA'd the skb information.  At this point, they are done with
+ * our side.  When all recipients are done processing, we
+ * release the skb and then release our pending message structure.
+ */
+static void
+xpnet_send_completed(enum xpc_retval reason, partid_t partid, int channel,
+		     void *__qm)
+{
+	struct xpnet_pending_msg *queued_msg = (struct xpnet_pending_msg *)__qm;
+
+	DBUG_ON(queued_msg == NULL);
+
+	dev_dbg(xpnet, "message to %d notified with reason %d\n",
+		partid, reason);
+
+	if (atomic_dec_return(&queued_msg->use_count) == 0) {
+		dev_dbg(xpnet, "all acks for skb->head=-x%p\n",
+			(void *)queued_msg->skb->head);
+
+		dev_kfree_skb_any(queued_msg->skb);
+		kfree(queued_msg);
+	}
+}
+
+/*
+ * Network layer has formatted a packet (skb) and is ready to place it
+ * "on the wire".  Prepare and send an xpnet_message to all partitions
+ * which have connected with us and are targets of this packet.
+ *
+ * MAC-NOTE:  For the XPNET driver, the MAC address contains the
+ * destination partition_id.  If the destination partition id word
+ * is 0xff, this packet is to broadcast to all partitions.
+ */
+static int
+xpnet_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct xpnet_pending_msg *queued_msg;
+	enum xpc_retval ret;
+	struct xpnet_message *msg;
+	u64 start_addr, end_addr;
+	long dp;
+	u8 second_mac_octet;
+	partid_t dest_partid;
+	struct xpnet_dev_private *priv;
+	u16 embedded_bytes;
+
+	priv = (struct xpnet_dev_private *)dev->priv;
+
+	dev_dbg(xpnet, ">skb->head=0x%p skb->data=0x%p skb->tail=0x%p "
+		"skb->end=0x%p skb->len=%d\n", (void *)skb->head,
+		(void *)skb->data, skb_tail_pointer(skb), skb_end_pointer(skb),
+		skb->len);
+
+	/*
+	 * The xpnet_pending_msg tracks how many outstanding
+	 * xpc_send_notifies are relying on this skb.  When none
+	 * remain, release the skb.
+	 */
+	queued_msg = kmalloc(sizeof(struct xpnet_pending_msg), GFP_ATOMIC);
+	if (queued_msg == NULL) {
+		dev_warn(xpnet, "failed to kmalloc %ld bytes; dropping "
+			 "packet\n", sizeof(struct xpnet_pending_msg));
+
+		priv->stats.tx_errors++;
+
+		return -ENOMEM;
+	}
+
+	/* get the beginning of the first cacheline and end of last */
+	start_addr = ((u64)skb->data & ~(L1_CACHE_BYTES - 1));
+	end_addr = L1_CACHE_ALIGN((u64)skb_tail_pointer(skb));
+
+	/* calculate how many bytes to embed in the XPC message */
+	embedded_bytes = 0;
+	if (unlikely(skb->len <= XPNET_MSG_DATA_MAX)) {
+		/* skb->data does fit so embed */
+		embedded_bytes = skb->len;
+	}
+
+	/*
+	 * Since the send occurs asynchronously, we set the count to one
+	 * and begin sending.  Any sends that happen to complete before
+	 * we are done sending will not free the skb.  We will be left
+	 * with that task during exit.  This also handles the case of
+	 * a packet destined for a partition which is no longer up.
+	 */
+	atomic_set(&queued_msg->use_count, 1);
+	queued_msg->skb = skb;
+
+	second_mac_octet = skb->data[XPNET_PARTID_OCTET];
+	if (second_mac_octet == 0xff) {
+		/* we are being asked to broadcast to all partitions */
+		dp = xpnet_broadcast_partitions;
+	} else if (second_mac_octet != 0) {
+		dp = xpnet_broadcast_partitions &
+		    (1UL << (second_mac_octet - 1));
+	} else {
+		/* 0 is an invalid partid.  Ignore */
+		dp = 0;
+	}
+	dev_dbg(xpnet, "destination Partitions mask (dp) = 0x%lx\n", dp);
+
+	/*
+	 * If we wanted to allow promiscuous mode to work like an
+	 * unswitched network, this would be a good point to OR in a
+	 * mask of partitions which should be receiving all packets.
+	 */
+
+	/*
+	 * Main send loop.
+	 */
+	for (dest_partid = 1; dp && dest_partid < XP_MAX_PARTITIONS;
+	     dest_partid++) {
+
+		if (!(dp & (1UL << (dest_partid - 1)))) {
+			/* not destined for this partition */
+			continue;
+		}
+
+		/* remove this partition from the destinations mask */
+		dp &= ~(1UL << (dest_partid - 1));
+
+		/* found a partition to send to */
+
+		ret = xpc_allocate(dest_partid, XPC_NET_CHANNEL,
+				   XPC_NOWAIT, (void **)&msg);
+		if (unlikely(ret != xpcSuccess))
+			continue;
+
+		msg->embedded_bytes = embedded_bytes;
+		if (unlikely(embedded_bytes != 0)) {
+			msg->version = XPNET_VERSION_EMBED;
+			dev_dbg(xpnet, "calling memcpy(0x%p, 0x%p, 0x%lx)\n",
+				&msg->data, skb->data, (size_t)embedded_bytes);
+			skb_copy_from_linear_data(skb, &msg->data,
+						  (size_t)embedded_bytes);
+		} else {
+			msg->version = XPNET_VERSION;
+		}
+		msg->magic = XPNET_MAGIC;
+		msg->size = end_addr - start_addr;
+		msg->leadin_ignore = (u64)skb->data - start_addr;
+		msg->tailout_ignore = end_addr - (u64)skb_tail_pointer(skb);
+		msg->buf_pa = __pa(start_addr);
+
+		dev_dbg(xpnet, "sending XPC message to %d:%d\n"
+			KERN_DEBUG "msg->buf_pa=0x%lx, msg->size=%u, "
+			"msg->leadin_ignore=%u, msg->tailout_ignore=%u\n",
+			dest_partid, XPC_NET_CHANNEL, msg->buf_pa, msg->size,
+			msg->leadin_ignore, msg->tailout_ignore);
+
+		atomic_inc(&queued_msg->use_count);
+
+		ret = xpc_send_notify(dest_partid, XPC_NET_CHANNEL, msg,
+				      xpnet_send_completed, queued_msg);
+		if (unlikely(ret != xpcSuccess)) {
+			atomic_dec(&queued_msg->use_count);
+			continue;
+		}
+	}
+
+	if (atomic_dec_return(&queued_msg->use_count) == 0) {
+		dev_dbg(xpnet, "no partitions to receive packet destined for "
+			"%d\n", dest_partid);
+
+		dev_kfree_skb(skb);
+		kfree(queued_msg);
+	}
+
+	priv->stats.tx_packets++;
+	priv->stats.tx_bytes += skb->len;
+
+	return 0;
+}
+
+/*
+ * Deal with transmit timeouts coming from the network layer.
+ */
+static void
+xpnet_dev_tx_timeout(struct net_device *dev)
+{
+	struct xpnet_dev_private *priv;
+
+	priv = (struct xpnet_dev_private *)dev->priv;
+
+	priv->stats.tx_errors++;
+	return;
+}
+
+static int __init
+xpnet_init(void)
+{
+	int i;
+	u32 license_num;
+	int result = -ENOMEM;
+
+	if (!ia64_platform_is("sn2"))
+		return -ENODEV;
+
+	dev_info(xpnet, "registering network device %s\n", XPNET_DEVICE_NAME);
+
+	/*
+	 * use ether_setup() to init the majority of our device
+	 * structure and then override the necessary pieces.
+	 */
+	xpnet_device = alloc_netdev(sizeof(struct xpnet_dev_private),
+				    XPNET_DEVICE_NAME, ether_setup);
+	if (xpnet_device == NULL)
+		return -ENOMEM;
+
+	netif_carrier_off(xpnet_device);
+
+	xpnet_device->mtu = XPNET_DEF_MTU;
+	xpnet_device->change_mtu = xpnet_dev_change_mtu;
+	xpnet_device->open = xpnet_dev_open;
+	xpnet_device->get_stats = xpnet_dev_get_stats;
+	xpnet_device->stop = xpnet_dev_stop;
+	xpnet_device->hard_start_xmit = xpnet_dev_hard_start_xmit;
+	xpnet_device->tx_timeout = xpnet_dev_tx_timeout;
+	xpnet_device->set_config = xpnet_dev_set_config;
+
+	/*
+	 * Multicast assumes the LSB of the first octet is set for multicast
+	 * MAC addresses.  We chose the first octet of the MAC to be unlikely
+	 * to collide with any vendor's officially issued MAC.
+	 */
+	xpnet_device->dev_addr[0] = 0xfe;
+	xpnet_device->dev_addr[XPNET_PARTID_OCTET] = sn_partition_id;
+	license_num = sn_partition_serial_number_val();
+	for (i = 3; i >= 0; i--) {
+		xpnet_device->dev_addr[XPNET_LICENSE_OCTET + i] =
+		    license_num & 0xff;
+		license_num = license_num >> 8;
+	}
+
+	/*
+	 * ether_setup() sets this to a multicast device.  We are
+	 * really not supporting multicast at this time.
+	 */
+	xpnet_device->flags &= ~IFF_MULTICAST;
+
+	/*
+	 * No need to checksum as it is a DMA transfer.  The BTE will
+	 * report an error if the data is not retrievable and the
+	 * packet will be dropped.
+	 */
+	xpnet_device->features = NETIF_F_NO_CSUM;
+
+	result = register_netdev(xpnet_device);
+	if (result != 0)
+		free_netdev(xpnet_device);
+
+	return result;
+}
+
+module_init(xpnet_init);
+
+static void __exit
+xpnet_exit(void)
+{
+	dev_info(xpnet, "unregistering network device %s\n",
+		 xpnet_device[0].name);
+
+	unregister_netdev(xpnet_device);
+
+	free_netdev(xpnet_device);
+}
+
+module_exit(xpnet_exit);
+
+MODULE_AUTHOR("Silicon Graphics, Inc.");
+MODULE_DESCRIPTION("Cross Partition Network adapter (XPNET)");
+MODULE_LICENSE("GPL");
diff --git a/drivers/misc/thinkpad_acpi.c b/drivers/misc/thinkpad_acpi.c
index 6cb781262f9..3f28f6eabdb 100644
--- a/drivers/misc/thinkpad_acpi.c
+++ b/drivers/misc/thinkpad_acpi.c
@@ -21,7 +21,7 @@
  *  02110-1301, USA.
  */
 
-#define TPACPI_VERSION "0.19"
+#define TPACPI_VERSION "0.20"
 #define TPACPI_SYSFS_VERSION 0x020200
 
 /*
@@ -67,6 +67,7 @@
 #include <linux/hwmon.h>
 #include <linux/hwmon-sysfs.h>
 #include <linux/input.h>
+#include <linux/leds.h>
 #include <asm/uaccess.h>
 
 #include <linux/dmi.h>
@@ -85,6 +86,8 @@
 #define TP_CMOS_VOLUME_MUTE	2
 #define TP_CMOS_BRIGHTNESS_UP	4
 #define TP_CMOS_BRIGHTNESS_DOWN	5
+#define TP_CMOS_THINKLIGHT_ON	12
+#define TP_CMOS_THINKLIGHT_OFF	13
 
 /* NVRAM Addresses */
 enum tp_nvram_addr {
@@ -133,8 +136,12 @@ enum {
 #define TPACPI_PROC_DIR "ibm"
 #define TPACPI_ACPI_EVENT_PREFIX "ibm"
 #define TPACPI_DRVR_NAME TPACPI_FILE
+#define TPACPI_DRVR_SHORTNAME "tpacpi"
 #define TPACPI_HWMON_DRVR_NAME TPACPI_NAME "_hwmon"
 
+#define TPACPI_NVRAM_KTHREAD_NAME "ktpacpi_nvramd"
+#define TPACPI_WORKQUEUE_NAME "ktpacpid"
+
 #define TPACPI_MAX_ACPI_ARGS 3
 
 /* Debugging */
@@ -225,6 +232,7 @@ static struct {
 	u32 light:1;
 	u32 light_status:1;
 	u32 bright_16levels:1;
+	u32 bright_acpimode:1;
 	u32 wan:1;
 	u32 fan_ctrl_status_undef:1;
 	u32 input_device_registered:1;
@@ -236,6 +244,11 @@ static struct {
 	u32 hotkey_poll_active:1;
 } tp_features;
 
+static struct {
+	u16 hotkey_mask_ff:1;
+	u16 bright_cmos_ec_unsync:1;
+} tp_warned;
+
 struct thinkpad_id_data {
 	unsigned int vendor;	/* ThinkPad vendor:
 				 * PCI_VENDOR_ID_IBM/PCI_VENDOR_ID_LENOVO */
@@ -246,7 +259,8 @@ struct thinkpad_id_data {
 	u16 bios_model;		/* Big Endian, TP-1Y = 0x5931, 0 = unknown */
 	u16 ec_model;
 
-	char *model_str;
+	char *model_str;	/* ThinkPad T43 */
+	char *nummodel_str;	/* 9384A9C for a 9384-A9C model */
 };
 static struct thinkpad_id_data thinkpad_id;
 
@@ -259,6 +273,16 @@ static enum {
 static int experimental;
 static u32 dbg_level;
 
+static struct workqueue_struct *tpacpi_wq;
+
+/* Special LED class that can defer work */
+struct tpacpi_led_classdev {
+	struct led_classdev led_classdev;
+	struct work_struct work;
+	enum led_brightness new_brightness;
+	unsigned int led;
+};
+
 /****************************************************************************
  ****************************************************************************
  *
@@ -807,6 +831,80 @@ static int parse_strtoul(const char *buf,
 	return 0;
 }
 
+static int __init tpacpi_query_bcl_levels(acpi_handle handle)
+{
+	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+	union acpi_object *obj;
+	int rc;
+
+	if (ACPI_SUCCESS(acpi_evaluate_object(handle, NULL, NULL, &buffer))) {
+		obj = (union acpi_object *)buffer.pointer;
+		if (!obj || (obj->type != ACPI_TYPE_PACKAGE)) {
+			printk(TPACPI_ERR "Unknown _BCL data, "
+			       "please report this to %s\n", TPACPI_MAIL);
+			rc = 0;
+		} else {
+			rc = obj->package.count;
+		}
+	} else {
+		return 0;
+	}
+
+	kfree(buffer.pointer);
+	return rc;
+}
+
+static acpi_status __init tpacpi_acpi_walk_find_bcl(acpi_handle handle,
+					u32 lvl, void *context, void **rv)
+{
+	char name[ACPI_PATH_SEGMENT_LENGTH];
+	struct acpi_buffer buffer = { sizeof(name), &name };
+
+	if (ACPI_SUCCESS(acpi_get_name(handle, ACPI_SINGLE_NAME, &buffer)) &&
+	    !strncmp("_BCL", name, sizeof(name) - 1)) {
+		BUG_ON(!rv || !*rv);
+		**(int **)rv = tpacpi_query_bcl_levels(handle);
+		return AE_CTRL_TERMINATE;
+	} else {
+		return AE_OK;
+	}
+}
+
+/*
+ * Returns 0 (no ACPI _BCL or _BCL invalid), or size of brightness map
+ */
+static int __init tpacpi_check_std_acpi_brightness_support(void)
+{
+	int status;
+	int bcl_levels = 0;
+	void *bcl_ptr = &bcl_levels;
+
+	if (!vid_handle) {
+		TPACPI_ACPIHANDLE_INIT(vid);
+	}
+	if (!vid_handle)
+		return 0;
+
+	/*
+	 * Search for a _BCL method, and execute it.  This is safe on all
+	 * ThinkPads, and as a side-effect, _BCL will place a Lenovo Vista
+	 * BIOS in ACPI backlight control mode.  We do NOT have to care
+	 * about calling the _BCL method in an enabled video device, any
+	 * will do for our purposes.
+	 */
+
+	status = acpi_walk_namespace(ACPI_TYPE_METHOD, vid_handle, 3,
+				     tpacpi_acpi_walk_find_bcl, NULL,
+				     &bcl_ptr);
+
+	if (ACPI_SUCCESS(status) && bcl_levels > 2) {
+		tp_features.bright_acpimode = 1;
+		return (bcl_levels - 2);
+	}
+
+	return 0;
+}
+
 /*************************************************************************
  * thinkpad-acpi driver attributes
  */
@@ -909,12 +1007,14 @@ static int __init thinkpad_acpi_driver_init(struct ibm_init_struct *iibm)
 			thinkpad_id.ec_version_str : "unknown");
 
 	if (thinkpad_id.vendor && thinkpad_id.model_str)
-		printk(TPACPI_INFO "%s %s\n",
+		printk(TPACPI_INFO "%s %s, model %s\n",
 			(thinkpad_id.vendor == PCI_VENDOR_ID_IBM) ?
 				"IBM" : ((thinkpad_id.vendor ==
 						PCI_VENDOR_ID_LENOVO) ?
 					"Lenovo" : "Unknown vendor"),
-			thinkpad_id.model_str);
+			thinkpad_id.model_str,
+			(thinkpad_id.nummodel_str) ?
+				thinkpad_id.nummodel_str : "unknown");
 
 	return 0;
 }
@@ -1107,6 +1207,19 @@ static int hotkey_mask_set(u32 mask)
 	int rc = 0;
 
 	if (tp_features.hotkey_mask) {
+		if (!tp_warned.hotkey_mask_ff &&
+		    (mask == 0xffff || mask == 0xffffff ||
+		     mask == 0xffffffff)) {
+			tp_warned.hotkey_mask_ff = 1;
+			printk(TPACPI_NOTICE
+			       "setting the hotkey mask to 0x%08x is likely "
+			       "not the best way to go about it\n", mask);
+			printk(TPACPI_NOTICE
+			       "please consider using the driver defaults, "
+			       "and refer to up-to-date thinkpad-acpi "
+			       "documentation\n");
+		}
+
 		HOTKEY_CONFIG_CRITICAL_START
 		for (i = 0; i < 32; i++) {
 			u32 m = 1 << i;
@@ -1427,8 +1540,7 @@ static void hotkey_poll_setup(int may_warn)
 	    (tpacpi_inputdev->users > 0 || hotkey_report_mode < 2)) {
 		if (!tpacpi_hotkey_task) {
 			tpacpi_hotkey_task = kthread_run(hotkey_kthread,
-							 NULL,
-							 TPACPI_FILE "d");
+					NULL, TPACPI_NVRAM_KTHREAD_NAME);
 			if (IS_ERR(tpacpi_hotkey_task)) {
 				tpacpi_hotkey_task = NULL;
 				printk(TPACPI_ERR
@@ -1887,6 +1999,9 @@ static int __init hotkey_init(struct ibm_init_struct *iibm)
 		KEY_UNKNOWN,	/* 0x0D: FN+INSERT */
 		KEY_UNKNOWN,	/* 0x0E: FN+DELETE */
 
+		/* These either have to go through ACPI video, or
+		 * act like in the IBM ThinkPads, so don't ever
+		 * enable them by default */
 		KEY_RESERVED,	/* 0x0F: FN+HOME (brightness up) */
 		KEY_RESERVED,	/* 0x10: FN+END (brightness down) */
 
@@ -2091,6 +2206,32 @@ static int __init hotkey_init(struct ibm_init_struct *iibm)
 			set_bit(SW_TABLET_MODE, tpacpi_inputdev->swbit);
 		}
 
+		/* Do not issue duplicate brightness change events to
+		 * userspace */
+		if (!tp_features.bright_acpimode)
+			/* update bright_acpimode... */
+			tpacpi_check_std_acpi_brightness_support();
+
+		if (tp_features.bright_acpimode) {
+			printk(TPACPI_INFO
+			       "This ThinkPad has standard ACPI backlight "
+			       "brightness control, supported by the ACPI "
+			       "video driver\n");
+			printk(TPACPI_NOTICE
+			       "Disabling thinkpad-acpi brightness events "
+			       "by default...\n");
+
+			/* The hotkey_reserved_mask change below is not
+			 * necessary while the keys are at KEY_RESERVED in the
+			 * default map, but better safe than sorry, leave it
+			 * here as a marker of what we have to do, especially
+			 * when we finally become able to set this at runtime
+			 * on response to X.org requests */
+			hotkey_reserved_mask |=
+				(1 << TP_ACPI_HOTKEYSCAN_FNHOME)
+				| (1 << TP_ACPI_HOTKEYSCAN_FNEND);
+		}
+
 		dbg_printk(TPACPI_DBG_INIT,
 				"enabling hot key handling\n");
 		res = hotkey_status_set(1);
@@ -3110,13 +3251,82 @@ static struct ibm_struct video_driver_data = {
 TPACPI_HANDLE(lght, root, "\\LGHT");	/* A21e, A2xm/p, T20-22, X20-21 */
 TPACPI_HANDLE(ledb, ec, "LEDB");		/* G4x */
 
+static int light_get_status(void)
+{
+	int status = 0;
+
+	if (tp_features.light_status) {
+		if (!acpi_evalf(ec_handle, &status, "KBLT", "d"))
+			return -EIO;
+		return (!!status);
+	}
+
+	return -ENXIO;
+}
+
+static int light_set_status(int status)
+{
+	int rc;
+
+	if (tp_features.light) {
+		if (cmos_handle) {
+			rc = acpi_evalf(cmos_handle, NULL, NULL, "vd",
+					(status)?
+						TP_CMOS_THINKLIGHT_ON :
+						TP_CMOS_THINKLIGHT_OFF);
+		} else {
+			rc = acpi_evalf(lght_handle, NULL, NULL, "vd",
+					(status)? 1 : 0);
+		}
+		return (rc)? 0 : -EIO;
+	}
+
+	return -ENXIO;
+}
+
+static void light_set_status_worker(struct work_struct *work)
+{
+	struct tpacpi_led_classdev *data =
+			container_of(work, struct tpacpi_led_classdev, work);
+
+	if (likely(tpacpi_lifecycle == TPACPI_LIFE_RUNNING))
+		light_set_status((data->new_brightness != LED_OFF));
+}
+
+static void light_sysfs_set(struct led_classdev *led_cdev,
+			enum led_brightness brightness)
+{
+	struct tpacpi_led_classdev *data =
+		container_of(led_cdev,
+			     struct tpacpi_led_classdev,
+			     led_classdev);
+	data->new_brightness = brightness;
+	queue_work(tpacpi_wq, &data->work);
+}
+
+static enum led_brightness light_sysfs_get(struct led_classdev *led_cdev)
+{
+	return (light_get_status() == 1)? LED_FULL : LED_OFF;
+}
+
+static struct tpacpi_led_classdev tpacpi_led_thinklight = {
+	.led_classdev = {
+		.name		= "tpacpi::thinklight",
+		.brightness_set	= &light_sysfs_set,
+		.brightness_get	= &light_sysfs_get,
+	}
+};
+
 static int __init light_init(struct ibm_init_struct *iibm)
 {
+	int rc = 0;
+
 	vdbg_printk(TPACPI_DBG_INIT, "initializing light subdriver\n");
 
 	TPACPI_ACPIHANDLE_INIT(ledb);
 	TPACPI_ACPIHANDLE_INIT(lght);
 	TPACPI_ACPIHANDLE_INIT(cmos);
+	INIT_WORK(&tpacpi_led_thinklight.work, light_set_status_worker);
 
 	/* light not supported on 570, 600e/x, 770e, 770x, G4x, R30, R31 */
 	tp_features.light = (cmos_handle || lght_handle) && !ledb_handle;
@@ -3130,13 +3340,31 @@ static int __init light_init(struct ibm_init_struct *iibm)
 	vdbg_printk(TPACPI_DBG_INIT, "light is %s\n",
 		str_supported(tp_features.light));
 
-	return (tp_features.light)? 0 : 1;
+	if (tp_features.light) {
+		rc = led_classdev_register(&tpacpi_pdev->dev,
+					   &tpacpi_led_thinklight.led_classdev);
+	}
+
+	if (rc < 0) {
+		tp_features.light = 0;
+		tp_features.light_status = 0;
+	} else {
+		rc = (tp_features.light)? 0 : 1;
+	}
+	return rc;
+}
+
+static void light_exit(void)
+{
+	led_classdev_unregister(&tpacpi_led_thinklight.led_classdev);
+	if (work_pending(&tpacpi_led_thinklight.work))
+		flush_workqueue(tpacpi_wq);
 }
 
 static int light_read(char *p)
 {
 	int len = 0;
-	int status = 0;
+	int status;
 
 	if (!tp_features.light) {
 		len += sprintf(p + len, "status:\t\tnot supported\n");
@@ -3144,8 +3372,9 @@ static int light_read(char *p)
 		len += sprintf(p + len, "status:\t\tunknown\n");
 		len += sprintf(p + len, "commands:\ton, off\n");
 	} else {
-		if (!acpi_evalf(ec_handle, &status, "KBLT", "d"))
-			return -EIO;
+		status = light_get_status();
+		if (status < 0)
+			return status;
 		len += sprintf(p + len, "status:\t\t%s\n", onoff(status, 0));
 		len += sprintf(p + len, "commands:\ton, off\n");
 	}
@@ -3155,37 +3384,29 @@ static int light_read(char *p)
 
 static int light_write(char *buf)
 {
-	int cmos_cmd, lght_cmd;
 	char *cmd;
-	int success;
+	int newstatus = 0;
 
 	if (!tp_features.light)
 		return -ENODEV;
 
 	while ((cmd = next_cmd(&buf))) {
 		if (strlencmp(cmd, "on") == 0) {
-			cmos_cmd = 0x0c;
-			lght_cmd = 1;
+			newstatus = 1;
 		} else if (strlencmp(cmd, "off") == 0) {
-			cmos_cmd = 0x0d;
-			lght_cmd = 0;
+			newstatus = 0;
 		} else
 			return -EINVAL;
-
-		success = cmos_handle ?
-		    acpi_evalf(cmos_handle, NULL, NULL, "vd", cmos_cmd) :
-		    acpi_evalf(lght_handle, NULL, NULL, "vd", lght_cmd);
-		if (!success)
-			return -EIO;
 	}
 
-	return 0;
+	return light_set_status(newstatus);
 }
 
 static struct ibm_struct light_driver_data = {
 	.name = "light",
 	.read = light_read,
 	.write = light_write,
+	.exit = light_exit,
 };
 
 /*************************************************************************
@@ -3583,6 +3804,12 @@ enum {	/* For TPACPI_LED_OLD */
 	TPACPI_LED_EC_HLMS = 0x0e,	/* EC reg to select led to command */
 };
 
+enum led_status_t {
+	TPACPI_LED_OFF = 0,
+	TPACPI_LED_ON,
+	TPACPI_LED_BLINK,
+};
+
 static enum led_access_mode led_supported;
 
 TPACPI_HANDLE(led, ec, "SLED",	/* 570 */
@@ -3591,8 +3818,174 @@ TPACPI_HANDLE(led, ec, "SLED",	/* 570 */
 	   "LED",		/* all others */
 	   );			/* R30, R31 */
 
+#define TPACPI_LED_NUMLEDS 8
+static struct tpacpi_led_classdev *tpacpi_leds;
+static enum led_status_t tpacpi_led_state_cache[TPACPI_LED_NUMLEDS];
+static const char const *tpacpi_led_names[TPACPI_LED_NUMLEDS] = {
+	/* there's a limit of 19 chars + NULL before 2.6.26 */
+	"tpacpi::power",
+	"tpacpi:orange:batt",
+	"tpacpi:green:batt",
+	"tpacpi::dock_active",
+	"tpacpi::bay_active",
+	"tpacpi::dock_batt",
+	"tpacpi::unknown_led",
+	"tpacpi::standby",
+};
+
+static int led_get_status(unsigned int led)
+{
+	int status;
+	enum led_status_t led_s;
+
+	switch (led_supported) {
+	case TPACPI_LED_570:
+		if (!acpi_evalf(ec_handle,
+				&status, "GLED", "dd", 1 << led))
+			return -EIO;
+		led_s = (status == 0)?
+				TPACPI_LED_OFF :
+				((status == 1)?
+					TPACPI_LED_ON :
+					TPACPI_LED_BLINK);
+		tpacpi_led_state_cache[led] = led_s;
+		return led_s;
+	default:
+		return -ENXIO;
+	}
+
+	/* not reached */
+}
+
+static int led_set_status(unsigned int led, enum led_status_t ledstatus)
+{
+	/* off, on, blink. Index is led_status_t */
+	static const int const led_sled_arg1[] = { 0, 1, 3 };
+	static const int const led_exp_hlbl[] = { 0, 0, 1 };	/* led# * */
+	static const int const led_exp_hlcl[] = { 0, 1, 1 };	/* led# * */
+	static const int const led_led_arg1[] = { 0, 0x80, 0xc0 };
+
+	int rc = 0;
+
+	switch (led_supported) {
+	case TPACPI_LED_570:
+			/* 570 */
+			led = 1 << led;
+			if (!acpi_evalf(led_handle, NULL, NULL, "vdd",
+					led, led_sled_arg1[ledstatus]))
+				rc = -EIO;
+			break;
+	case TPACPI_LED_OLD:
+			/* 600e/x, 770e, 770x, A21e, A2xm/p, T20-22, X20 */
+			led = 1 << led;
+			rc = ec_write(TPACPI_LED_EC_HLMS, led);
+			if (rc >= 0)
+				rc = ec_write(TPACPI_LED_EC_HLBL,
+					      led * led_exp_hlbl[ledstatus]);
+			if (rc >= 0)
+				rc = ec_write(TPACPI_LED_EC_HLCL,
+					      led * led_exp_hlcl[ledstatus]);
+			break;
+	case TPACPI_LED_NEW:
+			/* all others */
+			if (!acpi_evalf(led_handle, NULL, NULL, "vdd",
+					led, led_led_arg1[ledstatus]))
+				rc = -EIO;
+			break;
+	default:
+		rc = -ENXIO;
+	}
+
+	if (!rc)
+		tpacpi_led_state_cache[led] = ledstatus;
+
+	return rc;
+}
+
+static void led_sysfs_set_status(unsigned int led,
+				 enum led_brightness brightness)
+{
+	led_set_status(led,
+			(brightness == LED_OFF) ?
+			TPACPI_LED_OFF :
+			(tpacpi_led_state_cache[led] == TPACPI_LED_BLINK) ?
+				TPACPI_LED_BLINK : TPACPI_LED_ON);
+}
+
+static void led_set_status_worker(struct work_struct *work)
+{
+	struct tpacpi_led_classdev *data =
+		container_of(work, struct tpacpi_led_classdev, work);
+
+	if (likely(tpacpi_lifecycle == TPACPI_LIFE_RUNNING))
+		led_sysfs_set_status(data->led, data->new_brightness);
+}
+
+static void led_sysfs_set(struct led_classdev *led_cdev,
+			enum led_brightness brightness)
+{
+	struct tpacpi_led_classdev *data = container_of(led_cdev,
+			     struct tpacpi_led_classdev, led_classdev);
+
+	data->new_brightness = brightness;
+	queue_work(tpacpi_wq, &data->work);
+}
+
+static int led_sysfs_blink_set(struct led_classdev *led_cdev,
+			unsigned long *delay_on, unsigned long *delay_off)
+{
+	struct tpacpi_led_classdev *data = container_of(led_cdev,
+			     struct tpacpi_led_classdev, led_classdev);
+
+	/* Can we choose the flash rate? */
+	if (*delay_on == 0 && *delay_off == 0) {
+		/* yes. set them to the hardware blink rate (1 Hz) */
+		*delay_on = 500; /* ms */
+		*delay_off = 500; /* ms */
+	} else if ((*delay_on != 500) || (*delay_off != 500))
+		return -EINVAL;
+
+	data->new_brightness = TPACPI_LED_BLINK;
+	queue_work(tpacpi_wq, &data->work);
+
+	return 0;
+}
+
+static enum led_brightness led_sysfs_get(struct led_classdev *led_cdev)
+{
+	int rc;
+
+	struct tpacpi_led_classdev *data = container_of(led_cdev,
+			     struct tpacpi_led_classdev, led_classdev);
+
+	rc = led_get_status(data->led);
+
+	if (rc == TPACPI_LED_OFF || rc < 0)
+		rc = LED_OFF;	/* no error handling in led class :( */
+	else
+		rc = LED_FULL;
+
+	return rc;
+}
+
+static void led_exit(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < TPACPI_LED_NUMLEDS; i++) {
+		if (tpacpi_leds[i].led_classdev.name)
+			led_classdev_unregister(&tpacpi_leds[i].led_classdev);
+	}
+
+	kfree(tpacpi_leds);
+	tpacpi_leds = NULL;
+}
+
 static int __init led_init(struct ibm_init_struct *iibm)
 {
+	unsigned int i;
+	int rc;
+
 	vdbg_printk(TPACPI_DBG_INIT, "initializing LED subdriver\n");
 
 	TPACPI_ACPIHANDLE_INIT(led);
@@ -3613,10 +4006,41 @@ static int __init led_init(struct ibm_init_struct *iibm)
 	vdbg_printk(TPACPI_DBG_INIT, "LED commands are %s, mode %d\n",
 		str_supported(led_supported), led_supported);
 
+	tpacpi_leds = kzalloc(sizeof(*tpacpi_leds) * TPACPI_LED_NUMLEDS,
+			      GFP_KERNEL);
+	if (!tpacpi_leds) {
+		printk(TPACPI_ERR "Out of memory for LED data\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < TPACPI_LED_NUMLEDS; i++) {
+		tpacpi_leds[i].led = i;
+
+		tpacpi_leds[i].led_classdev.brightness_set = &led_sysfs_set;
+		tpacpi_leds[i].led_classdev.blink_set = &led_sysfs_blink_set;
+		if (led_supported == TPACPI_LED_570)
+			tpacpi_leds[i].led_classdev.brightness_get =
+							&led_sysfs_get;
+
+		tpacpi_leds[i].led_classdev.name = tpacpi_led_names[i];
+
+		INIT_WORK(&tpacpi_leds[i].work, led_set_status_worker);
+
+		rc = led_classdev_register(&tpacpi_pdev->dev,
+					   &tpacpi_leds[i].led_classdev);
+		if (rc < 0) {
+			tpacpi_leds[i].led_classdev.name = NULL;
+			led_exit();
+			return rc;
+		}
+	}
+
 	return (led_supported != TPACPI_LED_NONE)? 0 : 1;
 }
 
-#define led_status(s) ((s) == 0 ? "off" : ((s) == 1 ? "on" : "blinking"))
+#define str_led_status(s) \
+	((s) == TPACPI_LED_OFF ? "off" : \
+		((s) == TPACPI_LED_ON ? "on" : "blinking"))
 
 static int led_read(char *p)
 {
@@ -3632,11 +4056,11 @@ static int led_read(char *p)
 		/* 570 */
 		int i, status;
 		for (i = 0; i < 8; i++) {
-			if (!acpi_evalf(ec_handle,
-					&status, "GLED", "dd", 1 << i))
+			status = led_get_status(i);
+			if (status < 0)
 				return -EIO;
 			len += sprintf(p + len, "%d:\t\t%s\n",
-				       i, led_status(status));
+				       i, str_led_status(status));
 		}
 	}
 
@@ -3646,16 +4070,11 @@ static int led_read(char *p)
 	return len;
 }
 
-/* off, on, blink */
-static const int led_sled_arg1[] = { 0, 1, 3 };
-static const int led_exp_hlbl[] = { 0, 0, 1 };	/* led# * */
-static const int led_exp_hlcl[] = { 0, 1, 1 };	/* led# * */
-static const int led_led_arg1[] = { 0, 0x80, 0xc0 };
-
 static int led_write(char *buf)
 {
 	char *cmd;
-	int led, ind, ret;
+	int led, rc;
+	enum led_status_t s;
 
 	if (!led_supported)
 		return -ENODEV;
@@ -3665,38 +4084,18 @@ static int led_write(char *buf)
 			return -EINVAL;
 
 		if (strstr(cmd, "off")) {
-			ind = 0;
+			s = TPACPI_LED_OFF;
 		} else if (strstr(cmd, "on")) {
-			ind = 1;
+			s = TPACPI_LED_ON;
 		} else if (strstr(cmd, "blink")) {
-			ind = 2;
-		} else
-			return -EINVAL;
-
-		if (led_supported == TPACPI_LED_570) {
-			/* 570 */
-			led = 1 << led;
-			if (!acpi_evalf(led_handle, NULL, NULL, "vdd",
-					led, led_sled_arg1[ind]))
-				return -EIO;
-		} else if (led_supported == TPACPI_LED_OLD) {
-			/* 600e/x, 770e, 770x, A21e, A2xm/p, T20-22, X20 */
-			led = 1 << led;
-			ret = ec_write(TPACPI_LED_EC_HLMS, led);
-			if (ret >= 0)
-				ret = ec_write(TPACPI_LED_EC_HLBL,
-						led * led_exp_hlbl[ind]);
-			if (ret >= 0)
-				ret = ec_write(TPACPI_LED_EC_HLCL,
-						led * led_exp_hlcl[ind]);
-			if (ret < 0)
-				return ret;
+			s = TPACPI_LED_BLINK;
 		} else {
-			/* all others */
-			if (!acpi_evalf(led_handle, NULL, NULL, "vdd",
-					led, led_led_arg1[ind]))
-				return -EIO;
+			return -EINVAL;
 		}
+
+		rc = led_set_status(led, s);
+		if (rc < 0)
+			return rc;
 	}
 
 	return 0;
@@ -3706,6 +4105,7 @@ static struct ibm_struct led_driver_data = {
 	.name = "led",
 	.read = led_read,
 	.write = led_write,
+	.exit = led_exit,
 };
 
 /*************************************************************************
@@ -4170,8 +4570,16 @@ static struct ibm_struct ecdump_driver_data = {
 
 #define TPACPI_BACKLIGHT_DEV_NAME "thinkpad_screen"
 
+enum {
+	TP_EC_BACKLIGHT = 0x31,
+
+	/* TP_EC_BACKLIGHT bitmasks */
+	TP_EC_BACKLIGHT_LVLMSK = 0x1F,
+	TP_EC_BACKLIGHT_CMDMSK = 0xE0,
+	TP_EC_BACKLIGHT_MAPSW = 0x20,
+};
+
 static struct backlight_device *ibm_backlight_device;
-static int brightness_offset = 0x31;
 static int brightness_mode;
 static unsigned int brightness_enable = 2; /* 2 = auto, 0 = no, 1 = yes */
 
@@ -4180,16 +4588,24 @@ static struct mutex brightness_mutex;
 /*
  * ThinkPads can read brightness from two places: EC 0x31, or
  * CMOS NVRAM byte 0x5E, bits 0-3.
+ *
+ * EC 0x31 has the following layout
+ *   Bit 7: unknown function
+ *   Bit 6: unknown function
+ *   Bit 5: Z: honour scale changes, NZ: ignore scale changes
+ *   Bit 4: must be set to zero to avoid problems
+ *   Bit 3-0: backlight brightness level
+ *
+ * brightness_get_raw returns status data in the EC 0x31 layout
  */
-static int brightness_get(struct backlight_device *bd)
+static int brightness_get_raw(int *status)
 {
 	u8 lec = 0, lcmos = 0, level = 0;
 
 	if (brightness_mode & 1) {
-		if (!acpi_ec_read(brightness_offset, &lec))
+		if (!acpi_ec_read(TP_EC_BACKLIGHT, &lec))
 			return -EIO;
-		lec &= (tp_features.bright_16levels)? 0x0f : 0x07;
-		level = lec;
+		level = lec & TP_EC_BACKLIGHT_LVLMSK;
 	};
 	if (brightness_mode & 2) {
 		lcmos = (nvram_read_byte(TP_NVRAM_ADDR_BRIGHTNESS)
@@ -4199,16 +4615,27 @@ static int brightness_get(struct backlight_device *bd)
 		level = lcmos;
 	}
 
-	if (brightness_mode == 3 && lec != lcmos) {
-		printk(TPACPI_ERR
-			"CMOS NVRAM (%u) and EC (%u) do not agree "
-			"on display brightness level\n",
-			(unsigned int) lcmos,
-			(unsigned int) lec);
-		return -EIO;
+	if (brightness_mode == 3) {
+		*status = lec;	/* Prefer EC, CMOS is just a backing store */
+		lec &= TP_EC_BACKLIGHT_LVLMSK;
+		if (lec == lcmos)
+			tp_warned.bright_cmos_ec_unsync = 0;
+		else {
+			if (!tp_warned.bright_cmos_ec_unsync) {
+				printk(TPACPI_ERR
+					"CMOS NVRAM (%u) and EC (%u) do not "
+					"agree on display brightness level\n",
+					(unsigned int) lcmos,
+					(unsigned int) lec);
+				tp_warned.bright_cmos_ec_unsync = 1;
+			}
+			return -EIO;
+		}
+	} else {
+		*status = level;
 	}
 
-	return level;
+	return 0;
 }
 
 /* May return EINTR which can always be mapped to ERESTARTSYS */
@@ -4216,19 +4643,22 @@ static int brightness_set(int value)
 {
 	int cmos_cmd, inc, i, res;
 	int current_value;
+	int command_bits;
 
-	if (value > ((tp_features.bright_16levels)? 15 : 7))
+	if (value > ((tp_features.bright_16levels)? 15 : 7) ||
+	    value < 0)
 		return -EINVAL;
 
 	res = mutex_lock_interruptible(&brightness_mutex);
 	if (res < 0)
 		return res;
 
-	current_value = brightness_get(NULL);
-	if (current_value < 0) {
-		res = current_value;
+	res = brightness_get_raw(&current_value);
+	if (res < 0)
 		goto errout;
-	}
+
+	command_bits = current_value & TP_EC_BACKLIGHT_CMDMSK;
+	current_value &= TP_EC_BACKLIGHT_LVLMSK;
 
 	cmos_cmd = value > current_value ?
 			TP_CMOS_BRIGHTNESS_UP :
@@ -4243,7 +4673,8 @@ static int brightness_set(int value)
 			goto errout;
 		}
 		if ((brightness_mode & 1) &&
-		    !acpi_ec_write(brightness_offset, i + inc)) {
+		    !acpi_ec_write(TP_EC_BACKLIGHT,
+				   (i + inc) | command_bits)) {
 			res = -EIO;
 			goto errout;;
 		}
@@ -4266,106 +4697,23 @@ static int brightness_update_status(struct backlight_device *bd)
 				bd->props.brightness : 0);
 }
 
-static struct backlight_ops ibm_backlight_data = {
-	.get_brightness = brightness_get,
-	.update_status  = brightness_update_status,
-};
-
-/* --------------------------------------------------------------------- */
-
-static int __init tpacpi_query_bcll_levels(acpi_handle handle)
-{
-	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
-	union acpi_object *obj;
-	int rc;
-
-	if (ACPI_SUCCESS(acpi_evaluate_object(handle, NULL, NULL, &buffer))) {
-		obj = (union acpi_object *)buffer.pointer;
-		if (!obj || (obj->type != ACPI_TYPE_PACKAGE)) {
-			printk(TPACPI_ERR "Unknown BCLL data, "
-			       "please report this to %s\n", TPACPI_MAIL);
-			rc = 0;
-		} else {
-			rc = obj->package.count;
-		}
-	} else {
-		return 0;
-	}
-
-	kfree(buffer.pointer);
-	return rc;
-}
-
-static acpi_status __init brightness_find_bcll(acpi_handle handle, u32 lvl,
-					void *context, void **rv)
-{
-	char name[ACPI_PATH_SEGMENT_LENGTH];
-	struct acpi_buffer buffer = { sizeof(name), &name };
-
-	if (ACPI_SUCCESS(acpi_get_name(handle, ACPI_SINGLE_NAME, &buffer)) &&
-	    !strncmp("BCLL", name, sizeof(name) - 1)) {
-		if (tpacpi_query_bcll_levels(handle) == 16) {
-			*rv = handle;
-			return AE_CTRL_TERMINATE;
-		} else {
-			return AE_OK;
-		}
-	} else {
-		return AE_OK;
-	}
-}
-
-static int __init brightness_check_levels(void)
+static int brightness_get(struct backlight_device *bd)
 {
-	int status;
-	void *found_node = NULL;
+	int status, res;
 
-	if (!vid_handle) {
-		TPACPI_ACPIHANDLE_INIT(vid);
-	}
-	if (!vid_handle)
-		return 0;
-
-	/* Search for a BCLL package with 16 levels */
-	status = acpi_walk_namespace(ACPI_TYPE_PACKAGE, vid_handle, 3,
-					brightness_find_bcll, NULL,
-					&found_node);
-
-	return (ACPI_SUCCESS(status) && found_node != NULL);
-}
-
-static acpi_status __init brightness_find_bcl(acpi_handle handle, u32 lvl,
-					void *context, void **rv)
-{
-	char name[ACPI_PATH_SEGMENT_LENGTH];
-	struct acpi_buffer buffer = { sizeof(name), &name };
+	res = brightness_get_raw(&status);
+	if (res < 0)
+		return 0; /* FIXME: teach backlight about error handling */
 
-	if (ACPI_SUCCESS(acpi_get_name(handle, ACPI_SINGLE_NAME, &buffer)) &&
-	    !strncmp("_BCL", name, sizeof(name) - 1)) {
-		*rv = handle;
-		return AE_CTRL_TERMINATE;
-	} else {
-		return AE_OK;
-	}
+	return status & TP_EC_BACKLIGHT_LVLMSK;
 }
 
-static int __init brightness_check_std_acpi_support(void)
-{
-	int status;
-	void *found_node = NULL;
-
-	if (!vid_handle) {
-		TPACPI_ACPIHANDLE_INIT(vid);
-	}
-	if (!vid_handle)
-		return 0;
-
-	/* Search for a _BCL method, but don't execute it */
-	status = acpi_walk_namespace(ACPI_TYPE_METHOD, vid_handle, 3,
-				     brightness_find_bcl, NULL, &found_node);
+static struct backlight_ops ibm_backlight_data = {
+	.get_brightness = brightness_get,
+	.update_status  = brightness_update_status,
+};
 
-	return (ACPI_SUCCESS(status) && found_node != NULL);
-}
+/* --------------------------------------------------------------------- */
 
 static int __init brightness_init(struct ibm_init_struct *iibm)
 {
@@ -4375,13 +4723,19 @@ static int __init brightness_init(struct ibm_init_struct *iibm)
 
 	mutex_init(&brightness_mutex);
 
-	if (!brightness_enable) {
-		dbg_printk(TPACPI_DBG_INIT,
-			   "brightness support disabled by "
-			   "module parameter\n");
-		return 1;
-	} else if (brightness_enable > 1) {
-		if (brightness_check_std_acpi_support()) {
+	/*
+	 * We always attempt to detect acpi support, so as to switch
+	 * Lenovo Vista BIOS to ACPI brightness mode even if we are not
+	 * going to publish a backlight interface
+	 */
+	b = tpacpi_check_std_acpi_brightness_support();
+	if (b > 0) {
+		if (thinkpad_id.vendor == PCI_VENDOR_ID_LENOVO) {
+			printk(TPACPI_NOTICE
+			       "Lenovo BIOS switched to ACPI backlight "
+			       "control mode\n");
+		}
+		if (brightness_enable > 1) {
 			printk(TPACPI_NOTICE
 			       "standard ACPI backlight interface "
 			       "available, not loading native one...\n");
@@ -4389,6 +4743,22 @@ static int __init brightness_init(struct ibm_init_struct *iibm)
 		}
 	}
 
+	if (!brightness_enable) {
+		dbg_printk(TPACPI_DBG_INIT,
+			   "brightness support disabled by "
+			   "module parameter\n");
+		return 1;
+	}
+
+	if (b > 16) {
+		printk(TPACPI_ERR
+		       "Unsupported brightness interface, "
+		       "please contact %s\n", TPACPI_MAIL);
+		return 1;
+	}
+	if (b == 16)
+		tp_features.bright_16levels = 1;
+
 	if (!brightness_mode) {
 		if (thinkpad_id.vendor == PCI_VENDOR_ID_LENOVO)
 			brightness_mode = 2;
@@ -4402,12 +4772,7 @@ static int __init brightness_init(struct ibm_init_struct *iibm)
 	if (brightness_mode > 3)
 		return -EINVAL;
 
-	tp_features.bright_16levels =
-			thinkpad_id.vendor == PCI_VENDOR_ID_LENOVO &&
-			brightness_check_levels();
-
-	b = brightness_get(NULL);
-	if (b < 0)
+	if (brightness_get_raw(&b) < 0)
 		return 1;
 
 	if (tp_features.bright_16levels)
@@ -4425,7 +4790,7 @@ static int __init brightness_init(struct ibm_init_struct *iibm)
 
 	ibm_backlight_device->props.max_brightness =
 				(tp_features.bright_16levels)? 15 : 7;
-	ibm_backlight_device->props.brightness = b;
+	ibm_backlight_device->props.brightness = b & TP_EC_BACKLIGHT_LVLMSK;
 	backlight_update_status(ibm_backlight_device);
 
 	return 0;
@@ -5046,11 +5411,11 @@ static void fan_watchdog_reset(void)
 	if (fan_watchdog_maxinterval > 0 &&
 	    tpacpi_lifecycle != TPACPI_LIFE_EXITING) {
 		fan_watchdog_active = 1;
-		if (!schedule_delayed_work(&fan_watchdog_task,
+		if (!queue_delayed_work(tpacpi_wq, &fan_watchdog_task,
 				msecs_to_jiffies(fan_watchdog_maxinterval
 						 * 1000))) {
 			printk(TPACPI_ERR
-			       "failed to schedule the fan watchdog, "
+			       "failed to queue the fan watchdog, "
 			       "watchdog will not trigger\n");
 		}
 	} else
@@ -5420,7 +5785,7 @@ static void fan_exit(void)
 			   &driver_attr_fan_watchdog);
 
 	cancel_delayed_work(&fan_watchdog_task);
-	flush_scheduled_work();
+	flush_workqueue(tpacpi_wq);
 }
 
 static int fan_read(char *p)
@@ -5826,10 +6191,13 @@ static void __init get_thinkpad_model_data(struct thinkpad_id_data *tp)
 
 	tp->model_str = kstrdup(dmi_get_system_info(DMI_PRODUCT_VERSION),
 					GFP_KERNEL);
-	if (strnicmp(tp->model_str, "ThinkPad", 8) != 0) {
+	if (tp->model_str && strnicmp(tp->model_str, "ThinkPad", 8) != 0) {
 		kfree(tp->model_str);
 		tp->model_str = NULL;
 	}
+
+	tp->nummodel_str = kstrdup(dmi_get_system_info(DMI_PRODUCT_NAME),
+					GFP_KERNEL);
 }
 
 static int __init probe_for_thinkpad(void)
@@ -6071,6 +6439,9 @@ static void thinkpad_acpi_module_exit(void)
 	if (proc_dir)
 		remove_proc_entry(TPACPI_PROC_DIR, acpi_root_dir);
 
+	if (tpacpi_wq)
+		destroy_workqueue(tpacpi_wq);
+
 	kfree(thinkpad_id.bios_version_str);
 	kfree(thinkpad_id.ec_version_str);
 	kfree(thinkpad_id.model_str);
@@ -6101,6 +6472,12 @@ static int __init thinkpad_acpi_module_init(void)
 	TPACPI_ACPIHANDLE_INIT(ecrd);
 	TPACPI_ACPIHANDLE_INIT(ecwr);
 
+	tpacpi_wq = create_singlethread_workqueue(TPACPI_WORKQUEUE_NAME);
+	if (!tpacpi_wq) {
+		thinkpad_acpi_module_exit();
+		return -ENOMEM;
+	}
+
 	proc_dir = proc_mkdir(TPACPI_PROC_DIR, acpi_root_dir);
 	if (!proc_dir) {
 		printk(TPACPI_ERR
@@ -6223,6 +6600,8 @@ static int __init thinkpad_acpi_module_init(void)
 /* Please remove this in year 2009 */
 MODULE_ALIAS("ibm_acpi");
 
+MODULE_ALIAS(TPACPI_DRVR_SHORTNAME);
+
 /*
  * DMI matching for module autoloading
  *