Diffstat (limited to 'arch/x86_64/kernel')
-rw-r--r-- | arch/x86_64/kernel/entry.S         |  22
-rw-r--r-- | arch/x86_64/kernel/genapic.c       |  33
-rw-r--r-- | arch/x86_64/kernel/genapic_flat.c  | 142
-rw-r--r-- | arch/x86_64/kernel/head.S          |  16
-rw-r--r-- | arch/x86_64/kernel/irq.c           |  19
-rw-r--r-- | arch/x86_64/kernel/machine_kexec.c | 101
-rw-r--r-- | arch/x86_64/kernel/mce.c           |  93
-rw-r--r-- | arch/x86_64/kernel/mpparse.c       |  17
-rw-r--r-- | arch/x86_64/kernel/reboot.c        |  31
-rw-r--r-- | arch/x86_64/kernel/setup.c         |   5
-rw-r--r-- | arch/x86_64/kernel/setup64.c       |  18
-rw-r--r-- | arch/x86_64/kernel/smp.c           |  66
-rw-r--r-- | arch/x86_64/kernel/smpboot.c       |  94
-rw-r--r-- | arch/x86_64/kernel/traps.c         |   3
14 files changed, 380 insertions, 280 deletions
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 28817490fdc..096d470e280 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -76,7 +76,7 @@ .macro FAKE_STACK_FRAME child_rip /* push in order ss, rsp, eflags, cs, rip */ - xorq %rax, %rax + xorl %eax, %eax pushq %rax /* ss */ CFI_ADJUST_CFA_OFFSET 8 pushq %rax /* rsp */ @@ -423,7 +423,7 @@ ENTRY(stub_rt_sigreturn) testl $3,CS(%rdi) je 1f swapgs -1: addl $1,%gs:pda_irqcount # RED-PEN should check preempt count +1: incl %gs:pda_irqcount # RED-PEN should check preempt count movq %gs:pda_irqstackptr,%rax cmoveq %rax,%rsp pushq %rdi # save old stack @@ -436,7 +436,7 @@ ENTRY(common_interrupt) ret_from_intr: popq %rdi cli - subl $1,%gs:pda_irqcount + decl %gs:pda_irqcount #ifdef CONFIG_DEBUG_INFO movq RBP(%rdi),%rbp #endif @@ -494,7 +494,7 @@ retint_signal: sti SAVE_REST movq $-1,ORIG_RAX(%rsp) - xorq %rsi,%rsi # oldset + xorl %esi,%esi # oldset movq %rsp,%rdi # &pt_regs call do_notify_resume RESTORE_REST @@ -752,7 +752,7 @@ child_rip: movq %rsi, %rdi call *%rax # exit - xorq %rdi, %rdi + xorl %edi, %edi call do_exit /* @@ -918,3 +918,15 @@ ENTRY(machine_check) ENTRY(call_debug) zeroentry do_call_debug +ENTRY(call_softirq) + movq %gs:pda_irqstackptr,%rax + pushq %r15 + movq %rsp,%r15 + incl %gs:pda_irqcount + cmove %rax,%rsp + call __do_softirq + movq %r15,%rsp + decl %gs:pda_irqcount + popq %r15 + ret + diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c index 69b9c25a8fc..30c843a5efd 100644 --- a/arch/x86_64/kernel/genapic.c +++ b/arch/x86_64/kernel/genapic.c @@ -31,6 +31,7 @@ u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; extern struct genapic apic_cluster; extern struct genapic apic_flat; +extern struct genapic apic_physflat; struct genapic *genapic = &apic_flat; @@ -44,12 +45,7 @@ void __init clustered_apic_check(void) u8 clusters, max_cluster; u8 id; u8 cluster_cnt[NUM_APIC_CLUSTERS]; - - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - /* AMD always uses flat mode right now */ - genapic = &apic_flat; - goto print; - } + int num_cpus = 0; #if defined(CONFIG_ACPI_BUS) /* @@ -64,15 +60,34 @@ void __init clustered_apic_check(void) #endif memset(cluster_cnt, 0, sizeof(cluster_cnt)); - for (i = 0; i < NR_CPUS; i++) { id = bios_cpu_apicid[i]; - if (id != BAD_APICID) - cluster_cnt[APIC_CLUSTERID(id)]++; + if (id == BAD_APICID) + continue; + num_cpus++; + cluster_cnt[APIC_CLUSTERID(id)]++; } + /* Don't use clustered mode on AMD platforms. */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + genapic = &apic_physflat; +#ifndef CONFIG_CPU_HOTPLUG + /* In the CPU hotplug case we cannot use broadcast mode + because that opens a race when a CPU is removed. + Stay at physflat mode in this case. + It is bad to do this unconditionally though. Once + we have ACPI platform support for CPU hotplug + we should detect hotplug capablity from ACPI tables and + only do this when really needed. -AK */ + if (num_cpus <= 8) + genapic = &apic_flat; +#endif + goto print; + } + clusters = 0; max_cluster = 0; + for (i = 0; i < NUM_APIC_CLUSTERS; i++) { if (cluster_cnt[i] > 0) { ++clusters; diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index 28284696508..adc96282a9e 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -2,13 +2,11 @@ * Copyright 2004 James Cleverdon, IBM. * Subject to the GNU Public License, v.2 * - * Flat APIC subarch code. Maximum 8 CPUs, logical delivery. 
+ * Flat APIC subarch code. * * Hacked for x86-64 by James Cleverdon from i386 architecture code by * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. - * Ashok Raj <ashok.raj@intel.com> - * Removed IPI broadcast shortcut to support CPU hotplug */ #include <linux/config.h> #include <linux/threads.h> @@ -20,47 +18,6 @@ #include <asm/smp.h> #include <asm/ipi.h> -/* - * The following permit choosing broadcast IPI shortcut v.s sending IPI only - * to online cpus via the send_IPI_mask varient. - * The mask version is my preferred option, since it eliminates a lot of - * other extra code that would need to be written to cleanup intrs sent - * to a CPU while offline. - * - * Sending broadcast introduces lots of trouble in CPU hotplug situations. - * These IPI's are delivered to cpu's irrespective of their offline status - * and could pickup stale intr data when these CPUS are turned online. - * - * Not using broadcast is a cleaner approach IMO, but Andi Kleen disagrees with - * the idea of not using broadcast IPI's anymore. Hence the run time check - * is introduced, on his request so we can choose an alternate mechanism. - * - * Initial wacky performance tests that collect cycle counts show - * no increase in using mask v.s broadcast version. In fact they seem - * identical in terms of cycle counts. - * - * if we need to use broadcast, we need to do the following. - * - * cli; - * hold call_lock; - * clear any pending IPI, just ack and clear all pending intr - * set cpu_online_map; - * release call_lock; - * sti; - * - * The complicated dummy irq processing shown above is not required if - * we didnt sent IPI's to wrong CPU's in the first place. - * - * - Ashok Raj <ashok.raj@intel.com> - */ -#ifdef CONFIG_HOTPLUG_CPU -#define DEFAULT_SEND_IPI (1) -#else -#define DEFAULT_SEND_IPI (0) -#endif - -static int no_broadcast=DEFAULT_SEND_IPI; - static cpumask_t flat_target_cpus(void) { return cpu_online_map; @@ -119,37 +76,15 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) local_irq_restore(flags); } -static inline void __local_flat_send_IPI_allbutself(int vector) -{ - if (no_broadcast) { - cpumask_t mask = cpu_online_map; - int this_cpu = get_cpu(); - - cpu_clear(this_cpu, mask); - flat_send_IPI_mask(mask, vector); - put_cpu(); - } - else - __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL); -} - -static inline void __local_flat_send_IPI_all(int vector) -{ - if (no_broadcast) - flat_send_IPI_mask(cpu_online_map, vector); - else - __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); -} - static void flat_send_IPI_allbutself(int vector) { if (((num_online_cpus()) - 1) >= 1) - __local_flat_send_IPI_allbutself(vector); + __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); } static void flat_send_IPI_all(int vector) { - __local_flat_send_IPI_all(vector); + __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); } static int flat_apic_id_registered(void) @@ -170,16 +105,6 @@ static unsigned int phys_pkg_id(int index_msb) return ((ebx >> 24) & 0xFF) >> index_msb; } -static __init int no_ipi_broadcast(char *str) -{ - get_option(&str, &no_broadcast); - printk ("Using %s mode\n", no_broadcast ? 
"No IPI Broadcast" : - "IPI Broadcast"); - return 1; -} - -__setup("no_ipi_broadcast", no_ipi_broadcast); - struct genapic apic_flat = { .name = "flat", .int_delivery_mode = dest_LowestPrio, @@ -195,11 +120,62 @@ struct genapic apic_flat = { .phys_pkg_id = phys_pkg_id, }; -static int __init print_ipi_mode(void) +/* + * Physflat mode is used when there are more than 8 CPUs on a AMD system. + * We cannot use logical delivery in this case because the mask + * overflows, so use physical mode. + */ + +static cpumask_t physflat_target_cpus(void) +{ + return cpumask_of_cpu(0); +} + +static void physflat_send_IPI_mask(cpumask_t cpumask, int vector) +{ + send_IPI_mask_sequence(cpumask, vector); +} + +static void physflat_send_IPI_allbutself(int vector) +{ + cpumask_t allbutme = cpu_online_map; + int me = get_cpu(); + cpu_clear(me, allbutme); + physflat_send_IPI_mask(allbutme, vector); + put_cpu(); +} + +static void physflat_send_IPI_all(int vector) { - printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" : - "Shortcut"); - return 0; + physflat_send_IPI_mask(cpu_online_map, vector); } -late_initcall(print_ipi_mode); +static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + cpu = first_cpu(cpumask); + if ((unsigned)cpu < NR_CPUS) + return x86_cpu_to_apicid[cpu]; + else + return BAD_APICID; +} + +struct genapic apic_physflat = { + .name = "physical flat", + .int_delivery_mode = dest_LowestPrio, + .int_dest_mode = (APIC_DEST_PHYSICAL != 0), + .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_LOWEST, + .target_cpus = physflat_target_cpus, + .apic_id_registered = flat_apic_id_registered, + .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/ + .send_IPI_all = physflat_send_IPI_all, + .send_IPI_allbutself = physflat_send_IPI_allbutself, + .send_IPI_mask = physflat_send_IPI_mask, + .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, + .phys_pkg_id = phys_pkg_id, +}; diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 8d765aa77a2..98ff5eb32b9 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -137,14 +137,14 @@ startup_64: wrmsr /* Setup cr0 */ - xorq %rax, %rax - btsq $31, %rax /* Enable paging */ - btsq $0, %rax /* Enable protected mode */ - btsq $1, %rax /* Enable MP */ - btsq $4, %rax /* Enable ET */ - btsq $5, %rax /* Enable NE */ - btsq $16, %rax /* Enable WP */ - btsq $18, %rax /* Enable AM */ +#define CR0_PM 1 /* protected mode */ +#define CR0_MP (1<<1) +#define CR0_ET (1<<4) +#define CR0_NE (1<<5) +#define CR0_WP (1<<16) +#define CR0_AM (1<<18) +#define CR0_PAGING (1<<31) + movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax /* Make changes effective */ movq %rax, %cr0 diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index cc3fb85f514..849a20aec7c 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -135,3 +135,22 @@ void fixup_irqs(cpumask_t map) local_irq_disable(); } #endif + +extern void call_softirq(void); + +asmlinkage void do_softirq(void) +{ + __u32 pending; + unsigned long flags; + + if (in_interrupt()) + return; + + local_irq_save(flags); + pending = local_softirq_pending(); + /* Switch to interrupt stack */ + if (pending) + call_softirq(); + local_irq_restore(flags); +} +EXPORT_SYMBOL(do_softirq); diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c index 60d1eff4156..89fab51e20f 100644 --- 
a/arch/x86_64/kernel/machine_kexec.c +++ b/arch/x86_64/kernel/machine_kexec.c @@ -8,43 +8,26 @@ #include <linux/mm.h> #include <linux/kexec.h> -#include <linux/delay.h> #include <linux/string.h> #include <linux/reboot.h> -#include <asm/pda.h> #include <asm/pgtable.h> -#include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/io.h> -#include <asm/apic.h> -#include <asm/cpufeature.h> -#include <asm/hw_irq.h> - -#define LEVEL0_SIZE (1UL << 12UL) -#define LEVEL1_SIZE (1UL << 21UL) -#define LEVEL2_SIZE (1UL << 30UL) -#define LEVEL3_SIZE (1UL << 39UL) -#define LEVEL4_SIZE (1UL << 48UL) - -#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE) -#define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) - -static void init_level2_page(u64 *level2p, unsigned long addr) + +static void init_level2_page(pmd_t *level2p, unsigned long addr) { unsigned long end_addr; addr &= PAGE_MASK; - end_addr = addr + LEVEL2_SIZE; + end_addr = addr + PUD_SIZE; while (addr < end_addr) { - *(level2p++) = addr | L1_ATTR; - addr += LEVEL1_SIZE; + set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + addr += PMD_SIZE; } } -static int init_level3_page(struct kimage *image, u64 *level3p, +static int init_level3_page(struct kimage *image, pud_t *level3p, unsigned long addr, unsigned long last_addr) { unsigned long end_addr; @@ -52,32 +35,32 @@ static int init_level3_page(struct kimage *image, u64 *level3p, result = 0; addr &= PAGE_MASK; - end_addr = addr + LEVEL3_SIZE; + end_addr = addr + PGDIR_SIZE; while ((addr < last_addr) && (addr < end_addr)) { struct page *page; - u64 *level2p; + pmd_t *level2p; page = kimage_alloc_control_pages(image, 0); if (!page) { result = -ENOMEM; goto out; } - level2p = (u64 *)page_address(page); + level2p = (pmd_t *)page_address(page); init_level2_page(level2p, addr); - *(level3p++) = __pa(level2p) | L2_ATTR; - addr += LEVEL2_SIZE; + set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); + addr += PUD_SIZE; } /* clear the unused entries */ while (addr < end_addr) { - *(level3p++) = 0; - addr += LEVEL2_SIZE; + pud_clear(level3p++); + addr += PUD_SIZE; } out: return result; } -static int init_level4_page(struct kimage *image, u64 *level4p, +static int init_level4_page(struct kimage *image, pgd_t *level4p, unsigned long addr, unsigned long last_addr) { unsigned long end_addr; @@ -85,28 +68,28 @@ static int init_level4_page(struct kimage *image, u64 *level4p, result = 0; addr &= PAGE_MASK; - end_addr = addr + LEVEL4_SIZE; + end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE); while ((addr < last_addr) && (addr < end_addr)) { struct page *page; - u64 *level3p; + pud_t *level3p; page = kimage_alloc_control_pages(image, 0); if (!page) { result = -ENOMEM; goto out; } - level3p = (u64 *)page_address(page); + level3p = (pud_t *)page_address(page); result = init_level3_page(image, level3p, addr, last_addr); if (result) { goto out; } - *(level4p++) = __pa(level3p) | L3_ATTR; - addr += LEVEL3_SIZE; + set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); + addr += PGDIR_SIZE; } /* clear the unused entries */ while (addr < end_addr) { - *(level4p++) = 0; - addr += LEVEL3_SIZE; + pgd_clear(level4p++); + addr += PGDIR_SIZE; } out: return result; @@ -115,52 +98,50 @@ out: static int init_pgtable(struct kimage *image, unsigned long start_pgtable) { - u64 *level4p; - level4p = 
(u64 *)__va(start_pgtable); + pgd_t *level4p; + level4p = (pgd_t *)__va(start_pgtable); return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); } static void set_idt(void *newidt, u16 limit) { - unsigned char curidt[10]; + struct desc_ptr curidt; /* x86-64 supports unaliged loads & stores */ - (*(u16 *)(curidt)) = limit; - (*(u64 *)(curidt +2)) = (unsigned long)(newidt); + curidt.size = limit; + curidt.address = (unsigned long)newidt; __asm__ __volatile__ ( - "lidt %0\n" - : "=m" (curidt) + "lidtq %0\n" + : : "m" (curidt) ); }; static void set_gdt(void *newgdt, u16 limit) { - unsigned char curgdt[10]; + struct desc_ptr curgdt; /* x86-64 supports unaligned loads & stores */ - (*(u16 *)(curgdt)) = limit; - (*(u64 *)(curgdt +2)) = (unsigned long)(newgdt); + curgdt.size = limit; + curgdt.address = (unsigned long)newgdt; __asm__ __volatile__ ( - "lgdt %0\n" - : "=m" (curgdt) + "lgdtq %0\n" + : : "m" (curgdt) ); }; static void load_segments(void) { __asm__ __volatile__ ( - "\tmovl $"STR(__KERNEL_DS)",%eax\n" - "\tmovl %eax,%ds\n" - "\tmovl %eax,%es\n" - "\tmovl %eax,%ss\n" - "\tmovl %eax,%fs\n" - "\tmovl %eax,%gs\n" + "\tmovl %0,%%ds\n" + "\tmovl %0,%%es\n" + "\tmovl %0,%%ss\n" + "\tmovl %0,%%fs\n" + "\tmovl %0,%%gs\n" + : : "a" (__KERNEL_DS) ); -#undef STR -#undef __STR } typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page, @@ -178,7 +159,7 @@ int machine_kexec_prepare(struct kimage *image) /* Calculate the offsets */ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - control_code_buffer = start_pgtable + 4096UL; + control_code_buffer = start_pgtable + PAGE_SIZE; /* Setup the identity mapped 64bit page table */ result = init_pgtable(image, start_pgtable); @@ -214,7 +195,7 @@ NORET_TYPE void machine_kexec(struct kimage *image) /* Calculate the offsets */ page_list = image->head; start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - control_code_buffer = start_pgtable + 4096UL; + control_code_buffer = start_pgtable + PAGE_SIZE; /* Set the low half of the page table to my identity mapped * page table for kexec. Leave the high half pointing at the diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 21e70625a49..3b267c91bb0 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -15,6 +15,8 @@ #include <linux/sysdev.h> #include <linux/miscdevice.h> #include <linux/fs.h> +#include <linux/cpu.h> +#include <linux/percpu.h> #include <asm/processor.h> #include <asm/msr.h> #include <asm/mce.h> @@ -514,10 +516,7 @@ static struct sysdev_class mce_sysclass = { set_kset_name("machinecheck"), }; -static struct sys_device device_mce = { - .id = 0, - .cls = &mce_sysclass, -}; +static DEFINE_PER_CPU(struct sys_device, device_mce); /* Why are there no generic functions for this? */ #define ACCESSOR(name, var, start) \ @@ -542,27 +541,83 @@ ACCESSOR(bank4ctl,bank[4],mce_restart()) ACCESSOR(tolerant,tolerant,) ACCESSOR(check_interval,check_interval,mce_restart()) -static __cpuinit int mce_init_device(void) +/* Per cpu sysdev init. 
All of the cpus still share the same ctl bank */ +static __cpuinit int mce_create_device(unsigned int cpu) { int err; + if (!mce_available(&cpu_data[cpu])) + return -EIO; + + per_cpu(device_mce,cpu).id = cpu; + per_cpu(device_mce,cpu).cls = &mce_sysclass; + + err = sysdev_register(&per_cpu(device_mce,cpu)); + + if (!err) { + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank0ctl); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank1ctl); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank2ctl); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank3ctl); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank4ctl); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval); + } + return err; +} + +#ifdef CONFIG_HOTPLUG_CPU +static __cpuinit void mce_remove_device(unsigned int cpu) +{ + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank0ctl); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank1ctl); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank2ctl); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank3ctl); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank4ctl); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval); + sysdev_unregister(&per_cpu(device_mce,cpu)); +} +#endif + +/* Get notified when a cpu comes on/off. Be hotplug friendly. */ +static __cpuinit int +mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + mce_create_device(cpu); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + mce_remove_device(cpu); + break; +#endif + } + return NOTIFY_OK; +} + +static struct notifier_block mce_cpu_notifier = { + .notifier_call = mce_cpu_callback, +}; + +static __init int mce_init_device(void) +{ + int err; + int i = 0; + if (!mce_available(&boot_cpu_data)) return -EIO; err = sysdev_class_register(&mce_sysclass); - if (!err) - err = sysdev_register(&device_mce); - if (!err) { - /* could create per CPU objects, but it is not worth it. */ - sysdev_create_file(&device_mce, &attr_bank0ctl); - sysdev_create_file(&device_mce, &attr_bank1ctl); - sysdev_create_file(&device_mce, &attr_bank2ctl); - sysdev_create_file(&device_mce, &attr_bank3ctl); - sysdev_create_file(&device_mce, &attr_bank4ctl); - sysdev_create_file(&device_mce, &attr_tolerant); - sysdev_create_file(&device_mce, &attr_check_interval); - } - + + for_each_online_cpu(i) { + mce_create_device(i); + } + + register_cpu_notifier(&mce_cpu_notifier); misc_register(&mce_log_device); return err; - } + device_initcall(mce_init_device); diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 9c5aa2a790c..08abf9f5b15 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -109,7 +109,7 @@ static int __init mpf_checksum(unsigned char *mp, int len) static void __init MP_processor_info (struct mpc_config_processor *m) { - int ver; + int ver, cpu; static int found_bsp=0; if (!(m->mpc_cpuflag & CPU_ENABLED)) @@ -131,7 +131,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m) return; } - num_processors++; + cpu = num_processors++; if (m->mpc_apicid > MAX_APICS) { printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", @@ -155,13 +155,18 @@ static void __init MP_processor_info (struct mpc_config_processor *m) * in same order as logical cpu numbers. 
Hence the first * entry is BSP, and so on. */ + cpu = 0; + bios_cpu_apicid[0] = m->mpc_apicid; x86_cpu_to_apicid[0] = m->mpc_apicid; found_bsp = 1; - } else { - bios_cpu_apicid[num_processors - found_bsp] = m->mpc_apicid; - x86_cpu_to_apicid[num_processors - found_bsp] = m->mpc_apicid; - } + } else + cpu = num_processors - found_bsp; + bios_cpu_apicid[cpu] = m->mpc_apicid; + x86_cpu_to_apicid[cpu] = m->mpc_apicid; + + cpu_set(cpu, cpu_possible_map); + cpu_set(cpu, cpu_present_map); } static void __init MP_bus_info (struct mpc_config_bus *m) diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c index 57e71dbdfd6..47f95687905 100644 --- a/arch/x86_64/kernel/reboot.c +++ b/arch/x86_64/kernel/reboot.c @@ -109,23 +109,10 @@ void machine_shutdown(void) local_irq_enable(); } -void machine_restart(char * __unused) +void machine_emergency_restart(void) { int i; - printk("machine restart\n"); - - machine_shutdown(); - - if (!reboot_force) { - local_irq_disable(); -#ifndef CONFIG_SMP - disable_local_APIC(); -#endif - disable_IO_APIC(); - local_irq_enable(); - } - /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; @@ -150,18 +137,26 @@ void machine_restart(char * __unused) } } -EXPORT_SYMBOL(machine_restart); +void machine_restart(char * __unused) +{ + printk("machine restart\n"); + + if (!reboot_force) { + machine_shutdown(); + } + machine_emergency_restart(); +} void machine_halt(void) { } -EXPORT_SYMBOL(machine_halt); - void machine_power_off(void) { + if (!reboot_force) { + machine_shutdown(); + } if (pm_power_off) pm_power_off(); } -EXPORT_SYMBOL(machine_power_off); diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 5fd03225058..0aa526298e9 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -765,8 +765,6 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) int cpu = smp_processor_id(); int node = 0; unsigned bits; - if (c->x86_num_cores == 1) - return; bits = 0; while ((1 << bits) < c->x86_num_cores) @@ -1081,8 +1079,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) else mtrr_ap_init(); #ifdef CONFIG_NUMA - if (c != &boot_cpu_data) - numa_add_cpu(c - cpu_data); + numa_add_cpu(smp_processor_id()); #endif } diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index f2b9c6bc999..34082c1cc41 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -12,6 +12,7 @@ #include <linux/string.h> #include <linux/bootmem.h> #include <linux/bitops.h> +#include <linux/module.h> #include <asm/bootsetup.h> #include <asm/pda.h> #include <asm/pgtable.h> @@ -22,10 +23,8 @@ #include <asm/smp.h> #include <asm/i387.h> #include <asm/percpu.h> -#include <asm/mtrr.h> #include <asm/proto.h> -#include <asm/mman.h> -#include <asm/numa.h> +#include <asm/sections.h> char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; @@ -33,11 +32,6 @@ cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned; -extern struct task_struct init_task; - -extern unsigned char __per_cpu_start[], __per_cpu_end[]; - -extern struct desc_ptr cpu_gdt_descr[]; struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table }; char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); @@ -101,7 +95,7 @@ void __init setup_per_cpu_areas(void) #endif for (i = 0; i < NR_CPUS; i++) { - unsigned char *ptr; + char *ptr; if (!NODE_DATA(cpu_to_node(i))) { printk("cpu with no node %d, num_online_nodes %d\n", @@ -190,11 
+184,7 @@ void __cpuinit check_efer(void) */ void __cpuinit cpu_init (void) { -#ifdef CONFIG_SMP int cpu = stack_smp_processor_id(); -#else - int cpu = smp_processor_id(); -#endif struct tss_struct *t = &per_cpu(init_tss, cpu); unsigned long v; char *estacks = NULL; @@ -214,7 +204,7 @@ void __cpuinit cpu_init (void) printk("Initializing CPU#%d\n", cpu); - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); /* * Initialize the per-CPU GDT with the boot GDT, diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index ccae392886a..e5958220d6b 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -129,10 +129,9 @@ asmlinkage void smp_invalidate_interrupt (void) } else leave_mm(cpu); } +out: ack_APIC_irq(); cpu_clear(cpu, flush_cpumask); - -out: put_cpu_no_resched(); } @@ -294,6 +293,69 @@ void unlock_ipi_call_lock(void) } /* + * this function sends a 'generic call function' IPI to one other CPU + * in the system. + */ +static void __smp_call_function_single (int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + struct call_data_struct data; + int cpus = 1; + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + wmb(); + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (!wait) + return; + + while (atomic_read(&data.finished) != cpus) + cpu_relax(); +} + +/* + * smp_call_function_single - Run a function on another CPU + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: Currently unused. + * @wait: If true, wait until function has completed on other CPUs. + * + * Retrurns 0 on success, else a negative status code. + * + * Does not return until the remote CPU is nearly ready to execute <func> + * or is or has executed. + */ + +int smp_call_function_single (int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + /* prevent preemption and reschedule on another processor */ + int me = get_cpu(); + if (cpu == me) { + WARN_ON(1); + put_cpu(); + return -EBUSY; + } + spin_lock_bh(&call_lock); + __smp_call_function_single(cpu, func, info, nonatomic, wait); + spin_unlock_bh(&call_lock); + put_cpu(); + return 0; +} + +/* * this function sends a 'generic call function' IPI to all other CPUs * in the system. */ diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index b969ee12872..6e4807d64d4 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -113,24 +113,6 @@ struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; #define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) /* - * cpu_possible_map should be static, it cannot change as cpu's - * are onlined, or offlined. The reason is per-cpu data-structures - * are allocated by some modules at init time, and dont expect to - * do this dynamically on cpu arrival/departure. - * cpu_present_map on the other hand can change dynamically. - * In case when cpu_hotplug is not compiled, then we resort to current - * behaviour, which is cpu_possible == cpu_present. 
- * If cpu-hotplug is supported, then we need to preallocate for all - * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range. - * - Ashok Raj - */ -#ifdef CONFIG_HOTPLUG_CPU -#define fixup_cpu_possible_map(x) cpu_set((x), cpu_possible_map) -#else -#define fixup_cpu_possible_map(x) -#endif - -/* * Currently trivial. Write the real->protected mode * bootstrap into the page concerned. The caller * has made sure it's suitably aligned. @@ -229,9 +211,6 @@ static __cpuinit void sync_master(void *arg) { unsigned long flags, i; - if (smp_processor_id() != boot_cpu_id) - return; - go[MASTER] = 0; local_irq_save(flags); @@ -280,12 +259,12 @@ get_delta(long *rt, long *master) return tcenter - best_tm; } -static __cpuinit void sync_tsc(void) +static __cpuinit void sync_tsc(unsigned int master) { int i, done = 0; long delta, adj, adjust_latency = 0; unsigned long flags, rt, master_time_stamp, bound; -#if DEBUG_TSC_SYNC +#ifdef DEBUG_TSC_SYNC static struct syncdebug { long rt; /* roundtrip time */ long master; /* master's timestamp */ @@ -294,9 +273,17 @@ static __cpuinit void sync_tsc(void) } t[NUM_ROUNDS] __cpuinitdata; #endif + printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", + smp_processor_id(), master); + go[MASTER] = 1; - smp_call_function(sync_master, NULL, 1, 0); + /* It is dangerous to broadcast IPI as cpus are coming up, + * as they may not be ready to accept them. So since + * we only need to send the ipi to the boot cpu direct + * the message, and avoid the race. + */ + smp_call_function_single(master, sync_master, NULL, 1, 0); while (go[MASTER]) /* wait for master to be ready */ no_cpu_relax(); @@ -321,7 +308,7 @@ static __cpuinit void sync_tsc(void) rdtscll(t); wrmsrl(MSR_IA32_TSC, t + adj); } -#if DEBUG_TSC_SYNC +#ifdef DEBUG_TSC_SYNC t[i].rt = rt; t[i].master = master_time_stamp; t[i].diff = delta; @@ -331,7 +318,7 @@ static __cpuinit void sync_tsc(void) } spin_unlock_irqrestore(&tsc_sync_lock, flags); -#if DEBUG_TSC_SYNC +#ifdef DEBUG_TSC_SYNC for (i = 0; i < NUM_ROUNDS; ++i) printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n", t[i].rt, t[i].master, t[i].diff, t[i].lat); @@ -340,16 +327,14 @@ static __cpuinit void sync_tsc(void) printk(KERN_INFO "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, " "maxerr %lu cycles)\n", - smp_processor_id(), boot_cpu_id, delta, rt); + smp_processor_id(), master, delta, rt); } static void __cpuinit tsc_sync_wait(void) { if (notscsync || !cpu_has_tsc) return; - printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(), - boot_cpu_id); - sync_tsc(); + sync_tsc(boot_cpu_id); } static __init int notscsync_setup(char *s) @@ -537,7 +522,7 @@ void __cpuinit start_secondary(void) extern volatile unsigned long init_rsp; extern void (*initial_code)(void); -#if APIC_DEBUG +#ifdef APIC_DEBUG static void inquire_remote_apic(int apicid) { unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; @@ -773,8 +758,9 @@ do_rest: initial_code = start_secondary; clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK); - printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, - start_rip, init_rsp); + printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu, + cpus_weight(cpu_present_map), + apicid); /* * This grunge runs the startup process for @@ -841,7 +827,7 @@ do_rest: else /* trampoline code not run */ printk("Not responding.\n"); -#if APIC_DEBUG +#ifdef APIC_DEBUG inquire_remote_apic(apicid); #endif } @@ -924,6 +910,27 @@ static __init void enforce_max_cpus(unsigned max_cpus) } } 
+#ifdef CONFIG_HOTPLUG_CPU +/* + * cpu_possible_map should be static, it cannot change as cpu's + * are onlined, or offlined. The reason is per-cpu data-structures + * are allocated by some modules at init time, and dont expect to + * do this dynamically on cpu arrival/departure. + * cpu_present_map on the other hand can change dynamically. + * In case when cpu_hotplug is not compiled, then we resort to current + * behaviour, which is cpu_possible == cpu_present. + * If cpu-hotplug is supported, then we need to preallocate for all + * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range. + * - Ashok Raj + */ +static void prefill_possible_map(void) +{ + int i; + for (i = 0; i < NR_CPUS; i++) + cpu_set(i, cpu_possible_map); +} +#endif + /* * Various sanity checks. */ @@ -987,25 +994,15 @@ static int __init smp_sanity_check(unsigned max_cpus) */ void __init smp_prepare_cpus(unsigned int max_cpus) { - int i; - nmi_watchdog_default(); current_cpu_data = boot_cpu_data; current_thread_info()->cpu = 0; /* needed? */ enforce_max_cpus(max_cpus); - /* - * Fill in cpu_present_mask - */ - for (i = 0; i < NR_CPUS; i++) { - int apicid = cpu_present_to_apicid(i); - if (physid_isset(apicid, phys_cpu_present_map)) { - cpu_set(i, cpu_present_map); - cpu_set(i, cpu_possible_map); - } - fixup_cpu_possible_map(i); - } +#ifdef CONFIG_HOTPLUG_CPU + prefill_possible_map(); +#endif if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); @@ -1189,8 +1186,7 @@ void __cpu_die(unsigned int cpu) printk ("CPU %d is now offline\n", cpu); return; } - current->state = TASK_UNINTERRUPTIBLE; - schedule_timeout(HZ/10); + msleep(100); } printk(KERN_ERR "CPU %u didn't die...\n", cpu); } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 10273663000..6ead433a388 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -594,9 +594,6 @@ asmlinkage void default_do_nmi(struct pt_regs *regs) if (!cpu) reason = get_nmi_reason(); - if (!cpu_online(cpu)) - return; - if (!(reason & 0xc0)) { if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) == NOTIFY_STOP) |
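
The head.S hunk above replaces a chain of seven btsq instructions with a single movl of a composed constant. The arithmetic below is not in the patch; it just spells out the value the new code loads:

/* CR0_PM | CR0_MP | CR0_ET | CR0_NE | CR0_WP  | CR0_AM  | CR0_PAGING
 *  0x1   |  0x2   |  0x10  |  0x20  | 0x10000 | 0x40000 | 0x80000000
 * = 0x80050033
 * i.e. the movl sets PE, MP, ET, NE, WP, AM and PG in one go, and the
 * zero-extension of the 32-bit move clears the upper bits that the old
 * xorq %rax,%rax used to clear. */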
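
The machine_kexec.c hunks drop the file's private LEVELn_SIZE and L*_ATTR macros in favour of the generic 4-level page-table types (pgd_t/pud_t/pmd_t) and their set_*/clear helpers. The correspondence below is implied by the substitutions in the patch and is listed only for reference (x86-64, 4KB pages, 4-level paging):

/* removed macro               generic constant            value
 * LEVEL0_SIZE (1UL << 12)  -> PAGE_SIZE                    4 KB
 * LEVEL1_SIZE (1UL << 21)  -> PMD_SIZE                     2 MB
 * LEVEL2_SIZE (1UL << 30)  -> PUD_SIZE                     1 GB
 * LEVEL3_SIZE (1UL << 39)  -> PGDIR_SIZE                   512 GB
 * LEVEL4_SIZE (1UL << 48)  -> PTRS_PER_PGD * PGDIR_SIZE    256 TB
 *
 * L1_ATTR (present|rw|accessed|dirty|PSE) -> __PAGE_KERNEL_LARGE_EXEC
 * L2_ATTR / L3_ATTR (present|rw|accessed|dirty) -> _KERNPG_TABLE */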
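
The mce.c change replaces the single global machinecheck sysdev with one sysdev per CPU and keeps that set in sync with CPU hotplug through a notifier. The skeleton below restates that control flow in isolation; the foo_* names are placeholders, and the per-bank attribute files handled by the real code are elided:

/* Pattern used by mce.c: one sys_device per CPU, created for every CPU
 * already online at init time, then added/removed by a hotplug notifier
 * as CPUs come and go.  (Placeholder names; attribute files omitted.) */
static struct sysdev_class foo_sysclass = {
	set_kset_name("foo"),
};
static DEFINE_PER_CPU(struct sys_device, device_foo);

static int foo_create_device(unsigned int cpu)
{
	per_cpu(device_foo, cpu).id = cpu;
	per_cpu(device_foo, cpu).cls = &foo_sysclass;
	return sysdev_register(&per_cpu(device_foo, cpu));
}

static void foo_remove_device(unsigned int cpu)
{
	sysdev_unregister(&per_cpu(device_foo, cpu));
}

static int foo_cpu_callback(struct notifier_block *nfb,
			    unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		foo_create_device(cpu);
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
		foo_remove_device(cpu);
		break;
#endif
	}
	return NOTIFY_OK;
}

static struct notifier_block foo_cpu_notifier = {
	.notifier_call = foo_cpu_callback,
};

static int __init foo_init(void)
{
	int cpu, err;

	err = sysdev_class_register(&foo_sysclass);
	if (err)
		return err;
	for_each_online_cpu(cpu)
		foo_create_device(cpu);
	register_cpu_notifier(&foo_cpu_notifier);
	return 0;
}
device_initcall(foo_init);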
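
The smp.c hunk adds smp_call_function_single(), which directs a call-function IPI at exactly one CPU instead of broadcasting it (the TSC sync code in smpboot.c is its first user). Below is a minimal usage sketch, not taken from the patch; read_remote_tsc() and tsc_on_cpu() are hypothetical names. The callback runs in interrupt context on the target CPU, so it must be fast and must not sleep, and the call returns -EBUSY if the target is the calling CPU:

/* Illustrative only: read another CPU's TSC via the new helper. */
static void read_remote_tsc(void *info)
{
	u64 t;

	rdtscll(t);		/* executes on the target CPU */
	*(u64 *)info = t;
}

static u64 tsc_on_cpu(int cpu)
{
	u64 val = 0;

	/* nonatomic = 0, wait = 1: block until the target CPU has run
	 * the callback; fails with -EBUSY if cpu is the calling CPU. */
	if (smp_call_function_single(cpu, read_remote_tsc, &val, 0, 1))
		return 0;
	return val;
}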