aboutsummaryrefslogtreecommitdiff
path: root/arch/x86_64/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64/kernel')
-rw-r--r--arch/x86_64/kernel/entry.S22
-rw-r--r--arch/x86_64/kernel/genapic.c33
-rw-r--r--arch/x86_64/kernel/genapic_flat.c142
-rw-r--r--arch/x86_64/kernel/head.S16
-rw-r--r--arch/x86_64/kernel/irq.c19
-rw-r--r--arch/x86_64/kernel/machine_kexec.c101
-rw-r--r--arch/x86_64/kernel/mce.c93
-rw-r--r--arch/x86_64/kernel/mpparse.c17
-rw-r--r--arch/x86_64/kernel/reboot.c31
-rw-r--r--arch/x86_64/kernel/setup.c5
-rw-r--r--arch/x86_64/kernel/setup64.c18
-rw-r--r--arch/x86_64/kernel/smp.c66
-rw-r--r--arch/x86_64/kernel/smpboot.c94
-rw-r--r--arch/x86_64/kernel/traps.c3
14 files changed, 380 insertions, 280 deletions
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 28817490fdc..096d470e280 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -76,7 +76,7 @@
.macro FAKE_STACK_FRAME child_rip
/* push in order ss, rsp, eflags, cs, rip */
- xorq %rax, %rax
+ xorl %eax, %eax
pushq %rax /* ss */
CFI_ADJUST_CFA_OFFSET 8
pushq %rax /* rsp */
@@ -423,7 +423,7 @@ ENTRY(stub_rt_sigreturn)
testl $3,CS(%rdi)
je 1f
swapgs
-1: addl $1,%gs:pda_irqcount # RED-PEN should check preempt count
+1: incl %gs:pda_irqcount # RED-PEN should check preempt count
movq %gs:pda_irqstackptr,%rax
cmoveq %rax,%rsp
pushq %rdi # save old stack
@@ -436,7 +436,7 @@ ENTRY(common_interrupt)
ret_from_intr:
popq %rdi
cli
- subl $1,%gs:pda_irqcount
+ decl %gs:pda_irqcount
#ifdef CONFIG_DEBUG_INFO
movq RBP(%rdi),%rbp
#endif
@@ -494,7 +494,7 @@ retint_signal:
sti
SAVE_REST
movq $-1,ORIG_RAX(%rsp)
- xorq %rsi,%rsi # oldset
+ xorl %esi,%esi # oldset
movq %rsp,%rdi # &pt_regs
call do_notify_resume
RESTORE_REST
@@ -752,7 +752,7 @@ child_rip:
movq %rsi, %rdi
call *%rax
# exit
- xorq %rdi, %rdi
+ xorl %edi, %edi
call do_exit
/*
@@ -918,3 +918,15 @@ ENTRY(machine_check)
ENTRY(call_debug)
zeroentry do_call_debug
+ENTRY(call_softirq)
+ movq %gs:pda_irqstackptr,%rax
+ pushq %r15
+ movq %rsp,%r15
+ incl %gs:pda_irqcount
+ cmove %rax,%rsp
+ call __do_softirq
+ movq %r15,%rsp
+ decl %gs:pda_irqcount
+ popq %r15
+ ret
+
diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c
index 69b9c25a8fc..30c843a5efd 100644
--- a/arch/x86_64/kernel/genapic.c
+++ b/arch/x86_64/kernel/genapic.c
@@ -31,6 +31,7 @@ u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
extern struct genapic apic_cluster;
extern struct genapic apic_flat;
+extern struct genapic apic_physflat;
struct genapic *genapic = &apic_flat;
@@ -44,12 +45,7 @@ void __init clustered_apic_check(void)
u8 clusters, max_cluster;
u8 id;
u8 cluster_cnt[NUM_APIC_CLUSTERS];
-
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
- /* AMD always uses flat mode right now */
- genapic = &apic_flat;
- goto print;
- }
+ int num_cpus = 0;
#if defined(CONFIG_ACPI_BUS)
/*
@@ -64,15 +60,34 @@ void __init clustered_apic_check(void)
#endif
memset(cluster_cnt, 0, sizeof(cluster_cnt));
-
for (i = 0; i < NR_CPUS; i++) {
id = bios_cpu_apicid[i];
- if (id != BAD_APICID)
- cluster_cnt[APIC_CLUSTERID(id)]++;
+ if (id == BAD_APICID)
+ continue;
+ num_cpus++;
+ cluster_cnt[APIC_CLUSTERID(id)]++;
}
+ /* Don't use clustered mode on AMD platforms. */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ genapic = &apic_physflat;
+#ifndef CONFIG_CPU_HOTPLUG
+ /* In the CPU hotplug case we cannot use broadcast mode
+ because that opens a race when a CPU is removed.
+ Stay at physflat mode in this case.
+ It is bad to do this unconditionally though. Once
+ we have ACPI platform support for CPU hotplug
+ we should detect hotplug capability from ACPI tables and
+ only do this when really needed. -AK */
+ if (num_cpus <= 8)
+ genapic = &apic_flat;
+#endif
+ goto print;
+ }
+
clusters = 0;
max_cluster = 0;
+
for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
if (cluster_cnt[i] > 0) {
++clusters;
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
index 28284696508..adc96282a9e 100644
--- a/arch/x86_64/kernel/genapic_flat.c
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -2,13 +2,11 @@
* Copyright 2004 James Cleverdon, IBM.
* Subject to the GNU Public License, v.2
*
- * Flat APIC subarch code. Maximum 8 CPUs, logical delivery.
+ * Flat APIC subarch code.
*
* Hacked for x86-64 by James Cleverdon from i386 architecture code by
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
- * Ashok Raj <ashok.raj@intel.com>
- * Removed IPI broadcast shortcut to support CPU hotplug
*/
#include <linux/config.h>
#include <linux/threads.h>
@@ -20,47 +18,6 @@
#include <asm/smp.h>
#include <asm/ipi.h>
-/*
- * The following permit choosing broadcast IPI shortcut v.s sending IPI only
- * to online cpus via the send_IPI_mask varient.
- * The mask version is my preferred option, since it eliminates a lot of
- * other extra code that would need to be written to cleanup intrs sent
- * to a CPU while offline.
- *
- * Sending broadcast introduces lots of trouble in CPU hotplug situations.
- * These IPI's are delivered to cpu's irrespective of their offline status
- * and could pickup stale intr data when these CPUS are turned online.
- *
- * Not using broadcast is a cleaner approach IMO, but Andi Kleen disagrees with
- * the idea of not using broadcast IPI's anymore. Hence the run time check
- * is introduced, on his request so we can choose an alternate mechanism.
- *
- * Initial wacky performance tests that collect cycle counts show
- * no increase in using mask v.s broadcast version. In fact they seem
- * identical in terms of cycle counts.
- *
- * if we need to use broadcast, we need to do the following.
- *
- * cli;
- * hold call_lock;
- * clear any pending IPI, just ack and clear all pending intr
- * set cpu_online_map;
- * release call_lock;
- * sti;
- *
- * The complicated dummy irq processing shown above is not required if
- * we didnt sent IPI's to wrong CPU's in the first place.
- *
- * - Ashok Raj <ashok.raj@intel.com>
- */
-#ifdef CONFIG_HOTPLUG_CPU
-#define DEFAULT_SEND_IPI (1)
-#else
-#define DEFAULT_SEND_IPI (0)
-#endif
-
-static int no_broadcast=DEFAULT_SEND_IPI;
-
static cpumask_t flat_target_cpus(void)
{
return cpu_online_map;
@@ -119,37 +76,15 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
local_irq_restore(flags);
}
-static inline void __local_flat_send_IPI_allbutself(int vector)
-{
- if (no_broadcast) {
- cpumask_t mask = cpu_online_map;
- int this_cpu = get_cpu();
-
- cpu_clear(this_cpu, mask);
- flat_send_IPI_mask(mask, vector);
- put_cpu();
- }
- else
- __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
-}
-
-static inline void __local_flat_send_IPI_all(int vector)
-{
- if (no_broadcast)
- flat_send_IPI_mask(cpu_online_map, vector);
- else
- __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
-}
-
static void flat_send_IPI_allbutself(int vector)
{
if (((num_online_cpus()) - 1) >= 1)
- __local_flat_send_IPI_allbutself(vector);
+ __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
}
static void flat_send_IPI_all(int vector)
{
- __local_flat_send_IPI_all(vector);
+ __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
}
static int flat_apic_id_registered(void)
@@ -170,16 +105,6 @@ static unsigned int phys_pkg_id(int index_msb)
return ((ebx >> 24) & 0xFF) >> index_msb;
}
-static __init int no_ipi_broadcast(char *str)
-{
- get_option(&str, &no_broadcast);
- printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
- "IPI Broadcast");
- return 1;
-}
-
-__setup("no_ipi_broadcast", no_ipi_broadcast);
-
struct genapic apic_flat = {
.name = "flat",
.int_delivery_mode = dest_LowestPrio,
@@ -195,11 +120,62 @@ struct genapic apic_flat = {
.phys_pkg_id = phys_pkg_id,
};
-static int __init print_ipi_mode(void)
+/*
+ * Physflat mode is used when there are more than 8 CPUs on an AMD system.
+ * We cannot use logical delivery in this case because the mask
+ * overflows, so use physical mode.
+ */
+
+static cpumask_t physflat_target_cpus(void)
+{
+ return cpumask_of_cpu(0);
+}
+
+static void physflat_send_IPI_mask(cpumask_t cpumask, int vector)
+{
+ send_IPI_mask_sequence(cpumask, vector);
+}
+
+static void physflat_send_IPI_allbutself(int vector)
+{
+ cpumask_t allbutme = cpu_online_map;
+ int me = get_cpu();
+ cpu_clear(me, allbutme);
+ physflat_send_IPI_mask(allbutme, vector);
+ put_cpu();
+}
+
+static void physflat_send_IPI_all(int vector)
{
- printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
- "Shortcut");
- return 0;
+ physflat_send_IPI_mask(cpu_online_map, vector);
}
-late_initcall(print_ipi_mode);
+static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ cpu = first_cpu(cpumask);
+ if ((unsigned)cpu < NR_CPUS)
+ return x86_cpu_to_apicid[cpu];
+ else
+ return BAD_APICID;
+}
+
+struct genapic apic_physflat = {
+ .name = "physical flat",
+ .int_delivery_mode = dest_LowestPrio,
+ .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
+ .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_LOWEST,
+ .target_cpus = physflat_target_cpus,
+ .apic_id_registered = flat_apic_id_registered,
+ .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/
+ .send_IPI_all = physflat_send_IPI_all,
+ .send_IPI_allbutself = physflat_send_IPI_allbutself,
+ .send_IPI_mask = physflat_send_IPI_mask,
+ .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
+ .phys_pkg_id = phys_pkg_id,
+};
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 8d765aa77a2..98ff5eb32b9 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -137,14 +137,14 @@ startup_64:
wrmsr
/* Setup cr0 */
- xorq %rax, %rax
- btsq $31, %rax /* Enable paging */
- btsq $0, %rax /* Enable protected mode */
- btsq $1, %rax /* Enable MP */
- btsq $4, %rax /* Enable ET */
- btsq $5, %rax /* Enable NE */
- btsq $16, %rax /* Enable WP */
- btsq $18, %rax /* Enable AM */
+#define CR0_PM 1 /* protected mode */
+#define CR0_MP (1<<1)
+#define CR0_ET (1<<4)
+#define CR0_NE (1<<5)
+#define CR0_WP (1<<16)
+#define CR0_AM (1<<18)
+#define CR0_PAGING (1<<31)
+ movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax
/* Make changes effective */
movq %rax, %cr0
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index cc3fb85f514..849a20aec7c 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -135,3 +135,22 @@ void fixup_irqs(cpumask_t map)
local_irq_disable();
}
#endif
+
+extern void call_softirq(void);
+
+asmlinkage void do_softirq(void)
+{
+ __u32 pending;
+ unsigned long flags;
+
+ if (in_interrupt())
+ return;
+
+ local_irq_save(flags);
+ pending = local_softirq_pending();
+ /* Switch to interrupt stack */
+ if (pending)
+ call_softirq();
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(do_softirq);
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
index 60d1eff4156..89fab51e20f 100644
--- a/arch/x86_64/kernel/machine_kexec.c
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -8,43 +8,26 @@
#include <linux/mm.h>
#include <linux/kexec.h>
-#include <linux/delay.h>
#include <linux/string.h>
#include <linux/reboot.h>
-#include <asm/pda.h>
#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/io.h>
-#include <asm/apic.h>
-#include <asm/cpufeature.h>
-#include <asm/hw_irq.h>
-
-#define LEVEL0_SIZE (1UL << 12UL)
-#define LEVEL1_SIZE (1UL << 21UL)
-#define LEVEL2_SIZE (1UL << 30UL)
-#define LEVEL3_SIZE (1UL << 39UL)
-#define LEVEL4_SIZE (1UL << 48UL)
-
-#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE)
-#define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-
-static void init_level2_page(u64 *level2p, unsigned long addr)
+
+static void init_level2_page(pmd_t *level2p, unsigned long addr)
{
unsigned long end_addr;
addr &= PAGE_MASK;
- end_addr = addr + LEVEL2_SIZE;
+ end_addr = addr + PUD_SIZE;
while (addr < end_addr) {
- *(level2p++) = addr | L1_ATTR;
- addr += LEVEL1_SIZE;
+ set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+ addr += PMD_SIZE;
}
}
-static int init_level3_page(struct kimage *image, u64 *level3p,
+static int init_level3_page(struct kimage *image, pud_t *level3p,
unsigned long addr, unsigned long last_addr)
{
unsigned long end_addr;
@@ -52,32 +35,32 @@ static int init_level3_page(struct kimage *image, u64 *level3p,
result = 0;
addr &= PAGE_MASK;
- end_addr = addr + LEVEL3_SIZE;
+ end_addr = addr + PGDIR_SIZE;
while ((addr < last_addr) && (addr < end_addr)) {
struct page *page;
- u64 *level2p;
+ pmd_t *level2p;
page = kimage_alloc_control_pages(image, 0);
if (!page) {
result = -ENOMEM;
goto out;
}
- level2p = (u64 *)page_address(page);
+ level2p = (pmd_t *)page_address(page);
init_level2_page(level2p, addr);
- *(level3p++) = __pa(level2p) | L2_ATTR;
- addr += LEVEL2_SIZE;
+ set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
+ addr += PUD_SIZE;
}
/* clear the unused entries */
while (addr < end_addr) {
- *(level3p++) = 0;
- addr += LEVEL2_SIZE;
+ pud_clear(level3p++);
+ addr += PUD_SIZE;
}
out:
return result;
}
-static int init_level4_page(struct kimage *image, u64 *level4p,
+static int init_level4_page(struct kimage *image, pgd_t *level4p,
unsigned long addr, unsigned long last_addr)
{
unsigned long end_addr;
@@ -85,28 +68,28 @@ static int init_level4_page(struct kimage *image, u64 *level4p,
result = 0;
addr &= PAGE_MASK;
- end_addr = addr + LEVEL4_SIZE;
+ end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
while ((addr < last_addr) && (addr < end_addr)) {
struct page *page;
- u64 *level3p;
+ pud_t *level3p;
page = kimage_alloc_control_pages(image, 0);
if (!page) {
result = -ENOMEM;
goto out;
}
- level3p = (u64 *)page_address(page);
+ level3p = (pud_t *)page_address(page);
result = init_level3_page(image, level3p, addr, last_addr);
if (result) {
goto out;
}
- *(level4p++) = __pa(level3p) | L3_ATTR;
- addr += LEVEL3_SIZE;
+ set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
+ addr += PGDIR_SIZE;
}
/* clear the unused entries */
while (addr < end_addr) {
- *(level4p++) = 0;
- addr += LEVEL3_SIZE;
+ pgd_clear(level4p++);
+ addr += PGDIR_SIZE;
}
out:
return result;
@@ -115,52 +98,50 @@ out:
static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
{
- u64 *level4p;
- level4p = (u64 *)__va(start_pgtable);
+ pgd_t *level4p;
+ level4p = (pgd_t *)__va(start_pgtable);
return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
}
static void set_idt(void *newidt, u16 limit)
{
- unsigned char curidt[10];
+ struct desc_ptr curidt;
/* x86-64 supports unaliged loads & stores */
- (*(u16 *)(curidt)) = limit;
- (*(u64 *)(curidt +2)) = (unsigned long)(newidt);
+ curidt.size = limit;
+ curidt.address = (unsigned long)newidt;
__asm__ __volatile__ (
- "lidt %0\n"
- : "=m" (curidt)
+ "lidtq %0\n"
+ : : "m" (curidt)
);
};
static void set_gdt(void *newgdt, u16 limit)
{
- unsigned char curgdt[10];
+ struct desc_ptr curgdt;
/* x86-64 supports unaligned loads & stores */
- (*(u16 *)(curgdt)) = limit;
- (*(u64 *)(curgdt +2)) = (unsigned long)(newgdt);
+ curgdt.size = limit;
+ curgdt.address = (unsigned long)newgdt;
__asm__ __volatile__ (
- "lgdt %0\n"
- : "=m" (curgdt)
+ "lgdtq %0\n"
+ : : "m" (curgdt)
);
};
static void load_segments(void)
{
__asm__ __volatile__ (
- "\tmovl $"STR(__KERNEL_DS)",%eax\n"
- "\tmovl %eax,%ds\n"
- "\tmovl %eax,%es\n"
- "\tmovl %eax,%ss\n"
- "\tmovl %eax,%fs\n"
- "\tmovl %eax,%gs\n"
+ "\tmovl %0,%%ds\n"
+ "\tmovl %0,%%es\n"
+ "\tmovl %0,%%ss\n"
+ "\tmovl %0,%%fs\n"
+ "\tmovl %0,%%gs\n"
+ : : "a" (__KERNEL_DS)
);
-#undef STR
-#undef __STR
}
typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
@@ -178,7 +159,7 @@ int machine_kexec_prepare(struct kimage *image)
/* Calculate the offsets */
start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
- control_code_buffer = start_pgtable + 4096UL;
+ control_code_buffer = start_pgtable + PAGE_SIZE;
/* Setup the identity mapped 64bit page table */
result = init_pgtable(image, start_pgtable);
@@ -214,7 +195,7 @@ NORET_TYPE void machine_kexec(struct kimage *image)
/* Calculate the offsets */
page_list = image->head;
start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
- control_code_buffer = start_pgtable + 4096UL;
+ control_code_buffer = start_pgtable + PAGE_SIZE;
/* Set the low half of the page table to my identity mapped
* page table for kexec. Leave the high half pointing at the
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 21e70625a49..3b267c91bb0 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -15,6 +15,8 @@
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
+#include <linux/cpu.h>
+#include <linux/percpu.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
@@ -514,10 +516,7 @@ static struct sysdev_class mce_sysclass = {
set_kset_name("machinecheck"),
};
-static struct sys_device device_mce = {
- .id = 0,
- .cls = &mce_sysclass,
-};
+static DEFINE_PER_CPU(struct sys_device, device_mce);
/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
@@ -542,27 +541,83 @@ ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())
-static __cpuinit int mce_init_device(void)
+/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
+static __cpuinit int mce_create_device(unsigned int cpu)
{
int err;
+ if (!mce_available(&cpu_data[cpu]))
+ return -EIO;
+
+ per_cpu(device_mce,cpu).id = cpu;
+ per_cpu(device_mce,cpu).cls = &mce_sysclass;
+
+ err = sysdev_register(&per_cpu(device_mce,cpu));
+
+ if (!err) {
+ sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
+ sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
+ sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
+ sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
+ sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
+ sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
+ sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
+ }
+ return err;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static __cpuinit void mce_remove_device(unsigned int cpu)
+{
+ sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
+ sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
+ sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
+ sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
+ sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
+ sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
+ sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
+ sysdev_unregister(&per_cpu(device_mce,cpu));
+}
+#endif
+
+/* Get notified when a cpu comes on/off. Be hotplug friendly. */
+static __cpuinit int
+mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ mce_create_device(cpu);
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DEAD:
+ mce_remove_device(cpu);
+ break;
+#endif
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block mce_cpu_notifier = {
+ .notifier_call = mce_cpu_callback,
+};
+
+static __init int mce_init_device(void)
+{
+ int err;
+ int i = 0;
+
if (!mce_available(&boot_cpu_data))
return -EIO;
err = sysdev_class_register(&mce_sysclass);
- if (!err)
- err = sysdev_register(&device_mce);
- if (!err) {
- /* could create per CPU objects, but it is not worth it. */
- sysdev_create_file(&device_mce, &attr_bank0ctl);
- sysdev_create_file(&device_mce, &attr_bank1ctl);
- sysdev_create_file(&device_mce, &attr_bank2ctl);
- sysdev_create_file(&device_mce, &attr_bank3ctl);
- sysdev_create_file(&device_mce, &attr_bank4ctl);
- sysdev_create_file(&device_mce, &attr_tolerant);
- sysdev_create_file(&device_mce, &attr_check_interval);
- }
-
+
+ for_each_online_cpu(i) {
+ mce_create_device(i);
+ }
+
+ register_cpu_notifier(&mce_cpu_notifier);
misc_register(&mce_log_device);
return err;
-
}
+
device_initcall(mce_init_device);
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index 9c5aa2a790c..08abf9f5b15 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -109,7 +109,7 @@ static int __init mpf_checksum(unsigned char *mp, int len)
static void __init MP_processor_info (struct mpc_config_processor *m)
{
- int ver;
+ int ver, cpu;
static int found_bsp=0;
if (!(m->mpc_cpuflag & CPU_ENABLED))
@@ -131,7 +131,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
return;
}
- num_processors++;
+ cpu = num_processors++;
if (m->mpc_apicid > MAX_APICS) {
printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
@@ -155,13 +155,18 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
* in same order as logical cpu numbers. Hence the first
* entry is BSP, and so on.
*/
+ cpu = 0;
+
bios_cpu_apicid[0] = m->mpc_apicid;
x86_cpu_to_apicid[0] = m->mpc_apicid;
found_bsp = 1;
- } else {
- bios_cpu_apicid[num_processors - found_bsp] = m->mpc_apicid;
- x86_cpu_to_apicid[num_processors - found_bsp] = m->mpc_apicid;
- }
+ } else
+ cpu = num_processors - found_bsp;
+ bios_cpu_apicid[cpu] = m->mpc_apicid;
+ x86_cpu_to_apicid[cpu] = m->mpc_apicid;
+
+ cpu_set(cpu, cpu_possible_map);
+ cpu_set(cpu, cpu_present_map);
}
static void __init MP_bus_info (struct mpc_config_bus *m)
diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c
index 57e71dbdfd6..47f95687905 100644
--- a/arch/x86_64/kernel/reboot.c
+++ b/arch/x86_64/kernel/reboot.c
@@ -109,23 +109,10 @@ void machine_shutdown(void)
local_irq_enable();
}
-void machine_restart(char * __unused)
+void machine_emergency_restart(void)
{
int i;
- printk("machine restart\n");
-
- machine_shutdown();
-
- if (!reboot_force) {
- local_irq_disable();
-#ifndef CONFIG_SMP
- disable_local_APIC();
-#endif
- disable_IO_APIC();
- local_irq_enable();
- }
-
/* Tell the BIOS if we want cold or warm reboot */
*((unsigned short *)__va(0x472)) = reboot_mode;
@@ -150,18 +137,26 @@ void machine_restart(char * __unused)
}
}
-EXPORT_SYMBOL(machine_restart);
+void machine_restart(char * __unused)
+{
+ printk("machine restart\n");
+
+ if (!reboot_force) {
+ machine_shutdown();
+ }
+ machine_emergency_restart();
+}
void machine_halt(void)
{
}
-EXPORT_SYMBOL(machine_halt);
-
void machine_power_off(void)
{
+ if (!reboot_force) {
+ machine_shutdown();
+ }
if (pm_power_off)
pm_power_off();
}
-EXPORT_SYMBOL(machine_power_off);
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 5fd03225058..0aa526298e9 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -765,8 +765,6 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
int cpu = smp_processor_id();
int node = 0;
unsigned bits;
- if (c->x86_num_cores == 1)
- return;
bits = 0;
while ((1 << bits) < c->x86_num_cores)
@@ -1081,8 +1079,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
else
mtrr_ap_init();
#ifdef CONFIG_NUMA
- if (c != &boot_cpu_data)
- numa_add_cpu(c - cpu_data);
+ numa_add_cpu(smp_processor_id());
#endif
}
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index f2b9c6bc999..34082c1cc41 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -12,6 +12,7 @@
#include <linux/string.h>
#include <linux/bootmem.h>
#include <linux/bitops.h>
+#include <linux/module.h>
#include <asm/bootsetup.h>
#include <asm/pda.h>
#include <asm/pgtable.h>
@@ -22,10 +23,8 @@
#include <asm/smp.h>
#include <asm/i387.h>
#include <asm/percpu.h>
-#include <asm/mtrr.h>
#include <asm/proto.h>
-#include <asm/mman.h>
-#include <asm/numa.h>
+#include <asm/sections.h>
char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
@@ -33,11 +32,6 @@ cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned;
-extern struct task_struct init_task;
-
-extern unsigned char __per_cpu_start[], __per_cpu_end[];
-
-extern struct desc_ptr cpu_gdt_descr[];
struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table };
char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
@@ -101,7 +95,7 @@ void __init setup_per_cpu_areas(void)
#endif
for (i = 0; i < NR_CPUS; i++) {
- unsigned char *ptr;
+ char *ptr;
if (!NODE_DATA(cpu_to_node(i))) {
printk("cpu with no node %d, num_online_nodes %d\n",
@@ -190,11 +184,7 @@ void __cpuinit check_efer(void)
*/
void __cpuinit cpu_init (void)
{
-#ifdef CONFIG_SMP
int cpu = stack_smp_processor_id();
-#else
- int cpu = smp_processor_id();
-#endif
struct tss_struct *t = &per_cpu(init_tss, cpu);
unsigned long v;
char *estacks = NULL;
@@ -214,7 +204,7 @@ void __cpuinit cpu_init (void)
printk("Initializing CPU#%d\n", cpu);
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
/*
* Initialize the per-CPU GDT with the boot GDT,
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index ccae392886a..e5958220d6b 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -129,10 +129,9 @@ asmlinkage void smp_invalidate_interrupt (void)
} else
leave_mm(cpu);
}
+out:
ack_APIC_irq();
cpu_clear(cpu, flush_cpumask);
-
-out:
put_cpu_no_resched();
}
@@ -294,6 +293,69 @@ void unlock_ipi_call_lock(void)
}
/*
+ * this function sends a 'generic call function' IPI to one other CPU
+ * in the system.
+ */
+static void __smp_call_function_single (int cpu, void (*func) (void *info), void *info,
+ int nonatomic, int wait)
+{
+ struct call_data_struct data;
+ int cpus = 1;
+
+ data.func = func;
+ data.info = info;
+ atomic_set(&data.started, 0);
+ data.wait = wait;
+ if (wait)
+ atomic_set(&data.finished, 0);
+
+ call_data = &data;
+ wmb();
+ /* Send a message to the target CPU and wait for it to respond */
+ send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
+
+ /* Wait for response */
+ while (atomic_read(&data.started) != cpus)
+ cpu_relax();
+
+ if (!wait)
+ return;
+
+ while (atomic_read(&data.finished) != cpus)
+ cpu_relax();
+}
+
+/*
+ * smp_call_function_single - Run a function on another CPU
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: Currently unused.
+ * @wait: If true, wait until function has completed on the other CPU.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * Does not return until the remote CPU is nearly ready to execute <func>
+ * or is or has executed.
+ */
+
+int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
+ int nonatomic, int wait)
+{
+ /* prevent preemption and reschedule on another processor */
+ int me = get_cpu();
+ if (cpu == me) {
+ WARN_ON(1);
+ put_cpu();
+ return -EBUSY;
+ }
+ spin_lock_bh(&call_lock);
+ __smp_call_function_single(cpu, func, info, nonatomic, wait);
+ spin_unlock_bh(&call_lock);
+ put_cpu();
+ return 0;
+}
+
+/*
* this function sends a 'generic call function' IPI to all other CPUs
* in the system.
*/
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index b969ee12872..6e4807d64d4 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -113,24 +113,6 @@ struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p))
/*
- * cpu_possible_map should be static, it cannot change as cpu's
- * are onlined, or offlined. The reason is per-cpu data-structures
- * are allocated by some modules at init time, and dont expect to
- * do this dynamically on cpu arrival/departure.
- * cpu_present_map on the other hand can change dynamically.
- * In case when cpu_hotplug is not compiled, then we resort to current
- * behaviour, which is cpu_possible == cpu_present.
- * If cpu-hotplug is supported, then we need to preallocate for all
- * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range.
- * - Ashok Raj
- */
-#ifdef CONFIG_HOTPLUG_CPU
-#define fixup_cpu_possible_map(x) cpu_set((x), cpu_possible_map)
-#else
-#define fixup_cpu_possible_map(x)
-#endif
-
-/*
* Currently trivial. Write the real->protected mode
* bootstrap into the page concerned. The caller
* has made sure it's suitably aligned.
@@ -229,9 +211,6 @@ static __cpuinit void sync_master(void *arg)
{
unsigned long flags, i;
- if (smp_processor_id() != boot_cpu_id)
- return;
-
go[MASTER] = 0;
local_irq_save(flags);
@@ -280,12 +259,12 @@ get_delta(long *rt, long *master)
return tcenter - best_tm;
}
-static __cpuinit void sync_tsc(void)
+static __cpuinit void sync_tsc(unsigned int master)
{
int i, done = 0;
long delta, adj, adjust_latency = 0;
unsigned long flags, rt, master_time_stamp, bound;
-#if DEBUG_TSC_SYNC
+#ifdef DEBUG_TSC_SYNC
static struct syncdebug {
long rt; /* roundtrip time */
long master; /* master's timestamp */
@@ -294,9 +273,17 @@ static __cpuinit void sync_tsc(void)
} t[NUM_ROUNDS] __cpuinitdata;
#endif
+ printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n",
+ smp_processor_id(), master);
+
go[MASTER] = 1;
- smp_call_function(sync_master, NULL, 1, 0);
+ /* It is dangerous to broadcast an IPI while cpus are coming up,
+ * as they may not be ready to accept it. Since only the
+ * master cpu needs to receive the IPI, send the message
+ * directly to it and avoid the race.
+ */
+ smp_call_function_single(master, sync_master, NULL, 1, 0);
while (go[MASTER]) /* wait for master to be ready */
no_cpu_relax();
@@ -321,7 +308,7 @@ static __cpuinit void sync_tsc(void)
rdtscll(t);
wrmsrl(MSR_IA32_TSC, t + adj);
}
-#if DEBUG_TSC_SYNC
+#ifdef DEBUG_TSC_SYNC
t[i].rt = rt;
t[i].master = master_time_stamp;
t[i].diff = delta;
@@ -331,7 +318,7 @@ static __cpuinit void sync_tsc(void)
}
spin_unlock_irqrestore(&tsc_sync_lock, flags);
-#if DEBUG_TSC_SYNC
+#ifdef DEBUG_TSC_SYNC
for (i = 0; i < NUM_ROUNDS; ++i)
printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
t[i].rt, t[i].master, t[i].diff, t[i].lat);
@@ -340,16 +327,14 @@ static __cpuinit void sync_tsc(void)
printk(KERN_INFO
"CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
"maxerr %lu cycles)\n",
- smp_processor_id(), boot_cpu_id, delta, rt);
+ smp_processor_id(), master, delta, rt);
}
static void __cpuinit tsc_sync_wait(void)
{
if (notscsync || !cpu_has_tsc)
return;
- printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(),
- boot_cpu_id);
- sync_tsc();
+ sync_tsc(boot_cpu_id);
}
static __init int notscsync_setup(char *s)
@@ -537,7 +522,7 @@ void __cpuinit start_secondary(void)
extern volatile unsigned long init_rsp;
extern void (*initial_code)(void);
-#if APIC_DEBUG
+#ifdef APIC_DEBUG
static void inquire_remote_apic(int apicid)
{
unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
@@ -773,8 +758,9 @@ do_rest:
initial_code = start_secondary;
clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK);
- printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid,
- start_rip, init_rsp);
+ printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu,
+ cpus_weight(cpu_present_map),
+ apicid);
/*
* This grunge runs the startup process for
@@ -841,7 +827,7 @@ do_rest:
else
/* trampoline code not run */
printk("Not responding.\n");
-#if APIC_DEBUG
+#ifdef APIC_DEBUG
inquire_remote_apic(apicid);
#endif
}
@@ -924,6 +910,27 @@ static __init void enforce_max_cpus(unsigned max_cpus)
}
}
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * cpu_possible_map should be static, it cannot change as cpu's
+ * are onlined, or offlined. The reason is per-cpu data-structures
+ * are allocated by some modules at init time, and dont expect to
+ * do this dynamically on cpu arrival/departure.
+ * cpu_present_map on the other hand can change dynamically.
+ * In case when cpu_hotplug is not compiled, then we resort to current
+ * behaviour, which is cpu_possible == cpu_present.
+ * If cpu-hotplug is supported, then we need to preallocate for all
+ * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range.
+ * - Ashok Raj
+ */
+static void prefill_possible_map(void)
+{
+ int i;
+ for (i = 0; i < NR_CPUS; i++)
+ cpu_set(i, cpu_possible_map);
+}
+#endif
+
/*
* Various sanity checks.
*/
@@ -987,25 +994,15 @@ static int __init smp_sanity_check(unsigned max_cpus)
*/
void __init smp_prepare_cpus(unsigned int max_cpus)
{
- int i;
-
nmi_watchdog_default();
current_cpu_data = boot_cpu_data;
current_thread_info()->cpu = 0; /* needed? */
enforce_max_cpus(max_cpus);
- /*
- * Fill in cpu_present_mask
- */
- for (i = 0; i < NR_CPUS; i++) {
- int apicid = cpu_present_to_apicid(i);
- if (physid_isset(apicid, phys_cpu_present_map)) {
- cpu_set(i, cpu_present_map);
- cpu_set(i, cpu_possible_map);
- }
- fixup_cpu_possible_map(i);
- }
+#ifdef CONFIG_HOTPLUG_CPU
+ prefill_possible_map();
+#endif
if (smp_sanity_check(max_cpus) < 0) {
printk(KERN_INFO "SMP disabled\n");
@@ -1189,8 +1186,7 @@ void __cpu_die(unsigned int cpu)
printk ("CPU %d is now offline\n", cpu);
return;
}
- current->state = TASK_UNINTERRUPTIBLE;
- schedule_timeout(HZ/10);
+ msleep(100);
}
printk(KERN_ERR "CPU %u didn't die...\n", cpu);
}
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 10273663000..6ead433a388 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -594,9 +594,6 @@ asmlinkage void default_do_nmi(struct pt_regs *regs)
if (!cpu)
reason = get_nmi_reason();
- if (!cpu_online(cpu))
- return;
-
if (!(reason & 0xc0)) {
if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT)
== NOTIFY_STOP)