Diffstat (limited to 'arch/x86_64/kernel')
-rw-r--r-- | arch/x86_64/kernel/entry.S         |  22
-rw-r--r-- | arch/x86_64/kernel/genapic.c       |  33
-rw-r--r-- | arch/x86_64/kernel/genapic_flat.c  | 142
-rw-r--r-- | arch/x86_64/kernel/head.S          |  16
-rw-r--r-- | arch/x86_64/kernel/irq.c           |  19
-rw-r--r-- | arch/x86_64/kernel/machine_kexec.c | 101
-rw-r--r-- | arch/x86_64/kernel/mce.c           |  93
-rw-r--r-- | arch/x86_64/kernel/mpparse.c       |  17
-rw-r--r-- | arch/x86_64/kernel/reboot.c        |  31
-rw-r--r-- | arch/x86_64/kernel/setup.c         |   5
-rw-r--r-- | arch/x86_64/kernel/setup64.c       |  18
-rw-r--r-- | arch/x86_64/kernel/smp.c           |  66
-rw-r--r-- | arch/x86_64/kernel/smpboot.c       |  94
-rw-r--r-- | arch/x86_64/kernel/traps.c         |   3
14 files changed, 380 insertions, 280 deletions
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 28817490fdc..096d470e280 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -76,7 +76,7 @@ .macro FAKE_STACK_FRAME child_rip /* push in order ss, rsp, eflags, cs, rip */ - xorq %rax, %rax + xorl %eax, %eax pushq %rax /* ss */ CFI_ADJUST_CFA_OFFSET 8 pushq %rax /* rsp */ @@ -423,7 +423,7 @@ ENTRY(stub_rt_sigreturn) testl $3,CS(%rdi) je 1f swapgs -1: addl $1,%gs:pda_irqcount # RED-PEN should check preempt count +1: incl %gs:pda_irqcount # RED-PEN should check preempt count movq %gs:pda_irqstackptr,%rax cmoveq %rax,%rsp pushq %rdi # save old stack @@ -436,7 +436,7 @@ ENTRY(common_interrupt) ret_from_intr: popq %rdi cli - subl $1,%gs:pda_irqcount + decl %gs:pda_irqcount #ifdef CONFIG_DEBUG_INFO movq RBP(%rdi),%rbp #endif @@ -494,7 +494,7 @@ retint_signal: sti SAVE_REST movq $-1,ORIG_RAX(%rsp) - xorq %rsi,%rsi # oldset + xorl %esi,%esi # oldset movq %rsp,%rdi # &pt_regs call do_notify_resume RESTORE_REST @@ -752,7 +752,7 @@ child_rip: movq %rsi, %rdi call *%rax # exit - xorq %rdi, %rdi + xorl %edi, %edi call do_exit /* @@ -918,3 +918,15 @@ ENTRY(machine_check) ENTRY(call_debug) zeroentry do_call_debug +ENTRY(call_softirq) + movq %gs:pda_irqstackptr,%rax + pushq %r15 + movq %rsp,%r15 + incl %gs:pda_irqcount + cmove %rax,%rsp + call __do_softirq + movq %r15,%rsp + decl %gs:pda_irqcount + popq %r15 + ret + diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c index 69b9c25a8fc..30c843a5efd 100644 --- a/arch/x86_64/kernel/genapic.c +++ b/arch/x86_64/kernel/genapic.c @@ -31,6 +31,7 @@ u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; extern struct genapic apic_cluster; extern struct genapic apic_flat; +extern struct genapic apic_physflat; struct genapic *genapic = &apic_flat; @@ -44,12 +45,7 @@ void __init clustered_apic_check(void) u8 clusters, max_cluster; u8 id; u8 cluster_cnt[NUM_APIC_CLUSTERS]; - - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - /* AMD always uses flat mode right now */ - genapic = &apic_flat; - goto print; - } + int num_cpus = 0; #if defined(CONFIG_ACPI_BUS) /* @@ -64,15 +60,34 @@ void __init clustered_apic_check(void) #endif memset(cluster_cnt, 0, sizeof(cluster_cnt)); - for (i = 0; i < NR_CPUS; i++) { id = bios_cpu_apicid[i]; - if (id != BAD_APICID) - cluster_cnt[APIC_CLUSTERID(id)]++; + if (id == BAD_APICID) + continue; + num_cpus++; + cluster_cnt[APIC_CLUSTERID(id)]++; } + /* Don't use clustered mode on AMD platforms. */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + genapic = &apic_physflat; +#ifndef CONFIG_CPU_HOTPLUG + /* In the CPU hotplug case we cannot use broadcast mode + because that opens a race when a CPU is removed. + Stay at physflat mode in this case. + It is bad to do this unconditionally though. Once + we have ACPI platform support for CPU hotplug + we should detect hotplug capablity from ACPI tables and + only do this when really needed. -AK */ + if (num_cpus <= 8) + genapic = &apic_flat; +#endif + goto print; + } + clusters = 0; max_cluster = 0; + for (i = 0; i < NUM_APIC_CLUSTERS; i++) { if (cluster_cnt[i] > 0) { ++clusters; diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index 28284696508..adc96282a9e 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -2,13 +2,11 @@ * Copyright 2004 James Cleverdon, IBM. * Subject to the GNU Public License, v.2 * - * Flat APIC subarch code. Maximum 8 CPUs, logical delivery. 
+ * Flat APIC subarch code. * * Hacked for x86-64 by James Cleverdon from i386 architecture code by * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. - * Ashok Raj <ashok.raj@intel.com> - * Removed IPI broadcast shortcut to support CPU hotplug */ #include <linux/config.h> #include <linux/threads.h> @@ -20,47 +18,6 @@ #include <asm/smp.h> #include <asm/ipi.h> -/* - * The following permit choosing broadcast IPI shortcut v.s sending IPI only - * to online cpus via the send_IPI_mask varient. - * The mask version is my preferred option, since it eliminates a lot of - * other extra code that would need to be written to cleanup intrs sent - * to a CPU while offline. - * - * Sending broadcast introduces lots of trouble in CPU hotplug situations. - * These IPI's are delivered to cpu's irrespective of their offline status - * and could pickup stale intr data when these CPUS are turned online. - * - * Not using broadcast is a cleaner approach IMO, but Andi Kleen disagrees with - * the idea of not using broadcast IPI's anymore. Hence the run time check - * is introduced, on his request so we can choose an alternate mechanism. - * - * Initial wacky performance tests that collect cycle counts show - * no increase in using mask v.s broadcast version. In fact they seem - * identical in terms of cycle counts. - * - * if we need to use broadcast, we need to do the following. - * - * cli; - * hold call_lock; - * clear any pending IPI, just ack and clear all pending intr - * set cpu_online_map; - * release call_lock; - * sti; - * - * The complicated dummy irq processing shown above is not required if - * we didnt sent IPI's to wrong CPU's in the first place. - * - * - Ashok Raj <ashok.raj@intel.com> - */ -#ifdef CONFIG_HOTPLUG_CPU -#define DEFAULT_SEND_IPI (1) -#else -#define DEFAULT_SEND_IPI (0) -#endif - -static int no_broadcast=DEFAULT_SEND_IPI; - static cpumask_t flat_target_cpus(void) { return cpu_online_map; @@ -119,37 +76,15 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) local_irq_restore(flags); } -static inline void __local_flat_send_IPI_allbutself(int vector) -{ - if (no_broadcast) { - cpumask_t mask = cpu_online_map; - int this_cpu = get_cpu(); - - cpu_clear(this_cpu, mask); - flat_send_IPI_mask(mask, vector); - put_cpu(); - } - else - __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL); -} - -static inline void __local_flat_send_IPI_all(int vector) -{ - if (no_broadcast) - flat_send_IPI_mask(cpu_online_map, vector); - else - __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); -} - static void flat_send_IPI_allbutself(int vector) { if (((num_online_cpus()) - 1) >= 1) - __local_flat_send_IPI_allbutself(vector); + __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); } static void flat_send_IPI_all(int vector) { - __local_flat_send_IPI_all(vector); + __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); } static int flat_apic_id_registered(void) @@ -170,16 +105,6 @@ static unsigned int phys_pkg_id(int index_msb) return ((ebx >> 24) & 0xFF) >> index_msb; } -static __init int no_ipi_broadcast(char *str) -{ - get_option(&str, &no_broadcast); - printk ("Using %s mode\n", no_broadcast ? 
"No IPI Broadcast" : - "IPI Broadcast"); - return 1; -} - -__setup("no_ipi_broadcast", no_ipi_broadcast); - struct genapic apic_flat = { .name = "flat", .int_delivery_mode = dest_LowestPrio, @@ -195,11 +120,62 @@ struct genapic apic_flat = { .phys_pkg_id = phys_pkg_id, }; -static int __init print_ipi_mode(void) +/* + * Physflat mode is used when there are more than 8 CPUs on a AMD system. + * We cannot use logical delivery in this case because the mask + * overflows, so use physical mode. + */ + +static cpumask_t physflat_target_cpus(void) +{ + return cpumask_of_cpu(0); +} + +static void physflat_send_IPI_mask(cpumask_t cpumask, int vector) +{ + send_IPI_mask_sequence(cpumask, vector); +} + +static void physflat_send_IPI_allbutself(int vector) +{ + cpumask_t allbutme = cpu_online_map; + int me = get_cpu(); + cpu_clear(me, allbutme); + physflat_send_IPI_mask(allbutme, vector); + put_cpu(); +} + +static void physflat_send_IPI_all(int vector) { - printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" : - "Shortcut"); - return 0; + physflat_send_IPI_mask(cpu_online_map, vector); } -late_initcall(print_ipi_mode); +static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + cpu = first_cpu(cpumask); + if ((unsigned)cpu < NR_CPUS) + return x86_cpu_to_apicid[cpu]; + else + return BAD_APICID; +} + +struct genapic apic_physflat = { + .name = "physical flat", + .int_delivery_mode = dest_LowestPrio, + .int_dest_mode = (APIC_DEST_PHYSICAL != 0), + .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_LOWEST, + .target_cpus = physflat_target_cpus, + .apic_id_registered = flat_apic_id_registered, + .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/ + .send_IPI_all = physflat_send_IPI_all, + .send_IPI_allbutself = physflat_send_IPI_allbutself, + .send_IPI_mask = physflat_send_IPI_mask, + .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, + .phys_pkg_id = phys_pkg_id, +}; diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 8d765aa77a2..98ff5eb32b9 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -137,14 +137,14 @@ startup_64: wrmsr /* Setup cr0 */ - xorq %rax, %rax - btsq $31, %rax /* Enable paging */ - btsq $0, %rax /* Enable protected mode */ - btsq $1, %rax /* Enable MP */ - btsq $4, %rax /* Enable ET */ - btsq $5, %rax /* Enable NE */ - btsq $16, %rax /* Enable WP */ - btsq $18, %rax /* Enable AM */ +#define CR0_PM 1 /* protected mode */ +#define CR0_MP (1<<1) +#define CR0_ET (1<<4) +#define CR0_NE (1<<5) +#define CR0_WP (1<<16) +#define CR0_AM (1<<18) +#define CR0_PAGING (1<<31) + movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax /* Make changes effective */ movq %rax, %cr0 diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index cc3fb85f514..849a20aec7c 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -135,3 +135,22 @@ void fixup_irqs(cpumask_t map) local_irq_disable(); } #endif + +extern void call_softirq(void); + +asmlinkage void do_softirq(void) +{ + __u32 pending; + unsigned long flags; + + if (in_interrupt()) + return; + + local_irq_save(flags); + pending = local_softirq_pending(); + /* Switch to interrupt stack */ + if (pending) + call_softirq(); + local_irq_restore(flags); +} +EXPORT_SYMBOL(do_softirq); diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c index 60d1eff4156..89fab51e20f 100644 --- 
a/arch/x86_64/kernel/machine_kexec.c +++ b/arch/x86_64/kernel/machine_kexec.c @@ -8,43 +8,26 @@ #include <linux/mm.h> #include <linux/kexec.h> -#include <linux/delay.h> #include <linux/string.h> #include <linux/reboot.h> -#include <asm/pda.h> #include <asm/pgtable.h> -#include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/io.h> -#include <asm/apic.h> -#include <asm/cpufeature.h> -#include <asm/hw_irq.h> - -#define LEVEL0_SIZE (1UL << 12UL) -#define LEVEL1_SIZE (1UL << 21UL) -#define LEVEL2_SIZE (1UL << 30UL) -#define LEVEL3_SIZE (1UL << 39UL) -#define LEVEL4_SIZE (1UL << 48UL) - -#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE) -#define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) - -static void init_level2_page(u64 *level2p, unsigned long addr) + +static void init_level2_page(pmd_t *level2p, unsigned long addr) { unsigned long end_addr; addr &= PAGE_MASK; - end_addr = addr + LEVEL2_SIZE; + end_addr = addr + PUD_SIZE; while (addr < end_addr) { - *(level2p++) = addr | L1_ATTR; - addr += LEVEL1_SIZE; + set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + addr += PMD_SIZE; } } -static int init_level3_page(struct kimage *image, u64 *level3p, +static int init_level3_page(struct kimage *image, pud_t *level3p, unsigned long addr, unsigned long last_addr) { unsigned long end_addr; @@ -52,32 +35,32 @@ static int init_level3_page(struct kimage *image, u64 *level3p, result = 0; addr &= PAGE_MASK; - end_addr = addr + LEVEL3_SIZE; + end_addr = addr + PGDIR_SIZE; while ((addr < last_addr) && (addr < end_addr)) { struct page *page; - u64 *level2p; + pmd_t *level2p; page = kimage_alloc_control_pages(image, 0); if (!page) { result = -ENOMEM; goto out; } - level2p = (u64 *)page_address(page); + level2p = (pmd_t *)page_address(page); init_level2_page(level2p, addr); - *(level3p++) = __pa(level2p) | L2_ATTR; - addr += LEVEL2_SIZE; + set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); + addr += PUD_SIZE; } /* clear the unused entries */ while (addr < end_addr) { - *(level3p++) = 0; - addr += LEVEL2_SIZE; + pud_clear(level3p++); + addr += PUD_SIZE; } out: return result; } -static int init_level4_page(struct kimage *image, u64 *level4p, +static int init_level4_page(struct kimage *image, pgd_t *level4p, unsigned long addr, unsigned long last_addr) { unsigned long end_addr; @@ -85,28 +68,28 @@ static int init_level4_page(struct kimage *image, u64 *level4p, result = 0; addr &= PAGE_MASK; - end_addr = addr + LEVEL4_SIZE; + end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE); while ((addr < last_addr) && (addr < end_addr)) { struct page *page; - u64 *level3p; + pud_t *level3p; page = kimage_alloc_control_pages(image, 0); if (!page) { result = -ENOMEM; goto out; } - level3p = (u64 *)page_address(page); + level3p = (pud_t *)page_address(page); result = init_level3_page(image, level3p, addr, last_addr); if (result) { goto out; } - *(level4p++) = __pa(level3p) | L3_ATTR; - addr += LEVEL3_SIZE; + set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); + addr += PGDIR_SIZE; } /* clear the unused entries */ while (addr < end_addr) { - *(level4p++) = 0; - addr += LEVEL3_SIZE; + pgd_clear(level4p++); + addr += PGDIR_SIZE; } out: return result; @@ -115,52 +98,50 @@ out: static int init_pgtable(struct kimage *image, unsigned long start_pgtable) { - u64 *level4p; - level4p = 
(u64 *)__va(start_pgtable); + pgd_t *level4p; + level4p = (pgd_t *)__va(start_pgtable); return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); } static void set_idt(void *newidt, u16 limit) { - unsigned char curidt[10]; + struct desc_ptr curidt; /* x86-64 supports unaliged loads & stores */ - (*(u16 *)(curidt)) = limit; - (*(u64 *)(curidt +2)) = (unsigned long)(newidt); + curidt.size = limit; + curidt.address = (unsigned long)newidt; __asm__ __volatile__ ( - "lidt %0\n" - : "=m" (curidt) + "lidtq %0\n" + : : "m" (curidt) ); }; static void set_gdt(void *newgdt, u16 limit) { - unsigned char curgdt[10]; + struct desc_ptr curgdt; /* x86-64 supports unaligned loads & stores */ - (*(u16 *)(curgdt)) = limit; - (*(u64 *)(curgdt +2)) = (unsigned long)(newgdt); + curgdt.size = limit; + curgdt.address = (unsigned long)newgdt; __asm__ __volatile__ ( - "lgdt %0\n" - : "=m" (curgdt) + "lgdtq %0\n" + : : "m" (curgdt) ); }; static void load_segments(void) { __asm__ __volatile__ ( - "\tmovl $"STR(__KERNEL_DS)",%eax\n" - "\tmovl %eax,%ds\n" - "\tmovl %eax,%es\n" - "\tmovl %eax,%ss\n" - "\tmovl %eax,%fs\n" - "\tmovl %eax,%gs\n" + "\tmovl %0,%%ds\n" + "\tmovl %0,%%es\n" + "\tmovl %0,%%ss\n" + "\tmovl %0,%%fs\n" + "\tmovl %0,%%gs\n" + : : "a" (__KERNEL_DS) ); -#undef STR -#undef __STR } typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page, @@ -178,7 +159,7 @@ int machine_kexec_prepare(struct kimage *image) /* Calculate the offsets */ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - control_code_buffer = start_pgtable + 4096UL; + control_code_buffer = start_pgtable + PAGE_SIZE; /* Setup the identity mapped 64bit page table */ result = init_pgtable(image, start_pgtable); @@ -214,7 +195,7 @@ NORET_TYPE void machine_kexec(struct kimage *image) /* Calculate the offsets */ page_list = image->head; start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - control_code_buffer = start_pgtable + 4096UL; + control_code_buffer = start_pgtable + PAGE_SIZE; /* Set the low half of the page table to my identity mapped * page table for kexec. Leave the high half pointing at the diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 21e70625a49..3b267c91bb0 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -15,6 +15,8 @@ #include <linux/sysdev.h> #include <linux/miscdevice.h> #include <linux/fs.h> +#include <linux/cpu.h> +#include <linux/percpu.h> #include <asm/processor.h> #include <asm/msr.h> #include <asm/mce.h> @@ -514,10 +516,7 @@ static struct sysdev_class mce_sysclass = { set_kset_name("machinecheck"), }; -static struct sys_device device_mce = { - .id = 0, - .cls = &mce_sysclass, -}; +static DEFINE_PER_CPU(struct sys_device, device_mce); /* Why are there no generic functions for this? */ #define ACCESSOR(name, var, start) \ @@ -542,27 +541,83 @@ ACCESSOR(bank4ctl,bank[4],mce_restart()) ACCESSOR(tolerant,tolerant,) ACCESSOR(check_interval,check_interval,mce_restart()) -static __cpuinit int mce_init_device(void) +/* Per cpu sysdev init. 
All of the cpus still share the same ctl bank */ +static __cpuinit int mce_create_device(unsigned int cpu) { int err; + if (!mce_available(&cpu_data[cpu])) + return -EIO; + + per_cpu(device_mce,cpu).id = cpu; + per_cpu(device_mce,cpu).cls = &mce_sysclass; + + err = sysdev_register(&per_cpu(device_mce,cpu)); + + if (!err) { + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank0ctl); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank1ctl); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank2ctl); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank3ctl); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank4ctl); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval); + } + return err; +} + +#ifdef CONFIG_HOTPLUG_CPU +static __cpuinit void mce_remove_device(unsigned int cpu) +{ + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank0ctl); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank1ctl); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank2ctl); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank3ctl); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank4ctl); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval); + sysdev_unregister(&per_cpu(device_mce,cpu)); +} +#endif + +/* Get notified when a cpu comes on/off. Be hotplug friendly. */ +static __cpuinit int +mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + mce_create_device(cpu); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + mce_remove_device(cpu); + break; +#endif + } + return NOTIFY_OK; +} + +static struct notifier_block mce_cpu_notifier = { + .notifier_call = mce_cpu_callback, +}; + +static __init int mce_init_device(void) +{ + int err; + int i = 0; + if (!mce_available(&boot_cpu_data)) return -EIO; err = sysdev_class_register(&mce_sysclass); - if (!err) - err = sysdev_register(&device_mce); - if (!err) { - /* could create per CPU objects, but it is not worth it. */ - sysdev_create_file(&device_mce, &attr_bank0ctl); - sysdev_create_file(&device_mce, &attr_bank1ctl); - sysdev_create_file(&device_mce, &attr_bank2ctl); - sysdev_create_file(&device_mce, &attr_bank3ctl); - sysdev_create_file(&device_mce, &attr_bank4ctl); - sysdev_create_file(&device_mce, &attr_tolerant); - sysdev_create_file(&device_mce, &attr_check_interval); - } - + + for_each_online_cpu(i) { + mce_create_device(i); + } + + register_cpu_notifier(&mce_cpu_notifier); misc_register(&mce_log_device); return err; - } + device_initcall(mce_init_device); diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 9c5aa2a790c..08abf9f5b15 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -109,7 +109,7 @@ static int __init mpf_checksum(unsigned char *mp, int len) static void __init MP_processor_info (struct mpc_config_processor *m) { - int ver; + int ver, cpu; static int found_bsp=0; if (!(m->mpc_cpuflag & CPU_ENABLED)) @@ -131,7 +131,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m) return; } - num_processors++; + cpu = num_processors++; if (m->mpc_apicid > MAX_APICS) { printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", @@ -155,13 +155,18 @@ static void __init MP_processor_info (struct mpc_config_processor *m) * in same order as logical cpu numbers. 
Hence the first * entry is BSP, and so on. */ + cpu = 0; + bios_cpu_apicid[0] = m->mpc_apicid; x86_cpu_to_apicid[0] = m->mpc_apicid; found_bsp = 1; - } else { - bios_cpu_apicid[num_processors - found_bsp] = m->mpc_apicid; - x86_cpu_to_apicid[num_processors - found_bsp] = m->mpc_apicid; - } + } else + cpu = num_processors - found_bsp; + bios_cpu_apicid[cpu] = m->mpc_apicid; + x86_cpu_to_apicid[cpu] = m->mpc_apicid; + + cpu_set(cpu, cpu_possible_map); + cpu_set(cpu, cpu_present_map); } static void __init MP_bus_info (struct mpc_config_bus *m) diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c index 57e71dbdfd6..47f95687905 100644 --- a/arch/x86_64/kernel/reboot.c +++ b/arch/x86_64/kernel/reboot.c @@ -109,23 +109,10 @@ void machine_shutdown(void) local_irq_enable(); } -void machine_restart(char * __unused) +void machine_emergency_restart(void) { int i; - printk("machine restart\n"); - - machine_shutdown(); - - if (!reboot_force) { - local_irq_disable(); -#ifndef CONFIG_SMP - disable_local_APIC(); -#endif - disable_IO_APIC(); - local_irq_enable(); - } - /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; @@ -150,18 +137,26 @@ void machine_restart(char * __unused) } } -EXPORT_SYMBOL(machine_restart); +void machine_restart(char * __unused) +{ + printk("machine restart\n"); + + if (!reboot_force) { + machine_shutdown(); + } + machine_emergency_restart(); +} void machine_halt(void) { } -EXPORT_SYMBOL(machine_halt); - void machine_power_off(void) { + if (!reboot_force) { + machine_shutdown(); + } if (pm_power_off) pm_power_off(); } -EXPORT_SYMBOL(machine_power_off); diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 5fd03225058..0aa526298e9 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -765,8 +765,6 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) int cpu = smp_processor_id(); int node = 0; unsigned bits; - if (c->x86_num_cores == 1) - return; bits = 0; while ((1 << bits) < c->x86_num_cores) @@ -1081,8 +1079,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) else mtrr_ap_init(); #ifdef CONFIG_NUMA - if (c != &boot_cpu_data) - numa_add_cpu(c - cpu_data); + numa_add_cpu(smp_processor_id()); #endif } diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index f2b9c6bc999..34082c1cc41 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -12,6 +12,7 @@ #include <linux/string.h> #include <linux/bootmem.h> #include <linux/bitops.h> +#include <linux/module.h> #include <asm/bootsetup.h> #include <asm/pda.h> #include <asm/pgtable.h> @@ -22,10 +23,8 @@ #include <asm/smp.h> #include <asm/i387.h> #include <asm/percpu.h> -#include <asm/mtrr.h> #include <asm/proto.h> -#include <asm/mman.h> -#include <asm/numa.h> +#include <asm/sections.h> char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; @@ -33,11 +32,6 @@ cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned; -extern struct task_struct init_task; - -extern unsigned char __per_cpu_start[], __per_cpu_end[]; - -extern struct desc_ptr cpu_gdt_descr[]; struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table }; char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); @@ -101,7 +95,7 @@ void __init setup_per_cpu_areas(void) #endif for (i = 0; i < NR_CPUS; i++) { - unsigned char *ptr; + char *ptr; if (!NODE_DATA(cpu_to_node(i))) { printk("cpu with no node %d, num_online_nodes %d\n", @@ -190,11 
+184,7 @@ void __cpuinit check_efer(void) */ void __cpuinit cpu_init (void) { -#ifdef CONFIG_SMP int cpu = stack_smp_processor_id(); -#else - int cpu = smp_processor_id(); -#endif struct tss_struct *t = &per_cpu(init_tss, cpu); unsigned long v; char *estacks = NULL; @@ -214,7 +204,7 @@ void __cpuinit cpu_init (void) printk("Initializing CPU#%d\n", cpu); - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); /* * Initialize the per-CPU GDT with the boot GDT, diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index ccae392886a..e5958220d6b 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -129,10 +129,9 @@ asmlinkage void smp_invalidate_interrupt (void) } else leave_mm(cpu); } +out: ack_APIC_irq(); cpu_clear(cpu, flush_cpumask); - -out: put_cpu_no_resched(); } @@ -294,6 +293,69 @@ void unlock_ipi_call_lock(void) } /* + * this function sends a 'generic call function' IPI to one other CPU + * in the system. + */ +static void __smp_call_function_single (int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + struct call_data_struct data; + int cpus = 1; + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + wmb(); + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (!wait) + return; + + while (atomic_read(&data.finished) != cpus) + cpu_relax(); +} + +/* + * smp_call_function_single - Run a function on another CPU + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: Currently unused. + * @wait: If true, wait until function has completed on other CPUs. + * + * Retrurns 0 on success, else a negative status code. + * + * Does not return until the remote CPU is nearly ready to execute <func> + * or is or has executed. + */ + +int smp_call_function_single (int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + /* prevent preemption and reschedule on another processor */ + int me = get_cpu(); + if (cpu == me) { + WARN_ON(1); + put_cpu(); + return -EBUSY; + } + spin_lock_bh(&call_lock); + __smp_call_function_single(cpu, func, info, nonatomic, wait); + spin_unlock_bh(&call_lock); + put_cpu(); + return 0; +} + +/* * this function sends a 'generic call function' IPI to all other CPUs * in the system. */ diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index b969ee12872..6e4807d64d4 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -113,24 +113,6 @@ struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; #define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) /* - * cpu_possible_map should be static, it cannot change as cpu's - * are onlined, or offlined. The reason is per-cpu data-structures - * are allocated by some modules at init time, and dont expect to - * do this dynamically on cpu arrival/departure. - * cpu_present_map on the other hand can change dynamically. - * In case when cpu_hotplug is not compiled, then we resort to current - * behaviour, which is cpu_possible == cpu_present. 
- * If cpu-hotplug is supported, then we need to preallocate for all - * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range. - * - Ashok Raj - */ -#ifdef CONFIG_HOTPLUG_CPU -#define fixup_cpu_possible_map(x) cpu_set((x), cpu_possible_map) -#else -#define fixup_cpu_possible_map(x) -#endif - -/* * Currently trivial. Write the real->protected mode * bootstrap into the page concerned. The caller * has made sure it's suitably aligned. @@ -229,9 +211,6 @@ static __cpuinit void sync_master(void *arg) { unsigned long flags, i; - if (smp_processor_id() != boot_cpu_id) - return; - go[MASTER] = 0; local_irq_save(flags); @@ -280,12 +259,12 @@ get_delta(long *rt, long *master) return tcenter - best_tm; } -static __cpuinit void sync_tsc(void) +static __cpuinit void sync_tsc(unsigned int master) { int i, done = 0; long delta, adj, adjust_latency = 0; unsigned long flags, rt, master_time_stamp, bound; -#if DEBUG_TSC_SYNC +#ifdef DEBUG_TSC_SYNC static struct syncdebug { long rt; /* roundtrip time */ long master; /* master's timestamp */ @@ -294,9 +273,17 @@ static __cpuinit void sync_tsc(void) } t[NUM_ROUNDS] __cpuinitdata; #endif + printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", + smp_processor_id(), master); + go[MASTER] = 1; - smp_call_function(sync_master, NULL, 1, 0); + /* It is dangerous to broadcast IPI as cpus are coming up, + * as they may not be ready to accept them. So since + * we only need to send the ipi to the boot cpu direct + * the message, and avoid the race. + */ + smp_call_function_single(master, sync_master, NULL, 1, 0); while (go[MASTER]) /* wait for master to be ready */ no_cpu_relax(); @@ -321,7 +308,7 @@ static __cpuinit void sync_tsc(void) rdtscll(t); wrmsrl(MSR_IA32_TSC, t + adj); } -#if DEBUG_TSC_SYNC +#ifdef DEBUG_TSC_SYNC t[i].rt = rt; t[i].master = master_time_stamp; t[i].diff = delta; @@ -331,7 +318,7 @@ static __cpuinit void sync_tsc(void) } spin_unlock_irqrestore(&tsc_sync_lock, flags); -#if DEBUG_TSC_SYNC +#ifdef DEBUG_TSC_SYNC for (i = 0; i < NUM_ROUNDS; ++i) printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n", t[i].rt, t[i].master, t[i].diff, t[i].lat); @@ -340,16 +327,14 @@ static __cpuinit void sync_tsc(void) printk(KERN_INFO "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, " "maxerr %lu cycles)\n", - smp_processor_id(), boot_cpu_id, delta, rt); + smp_processor_id(), master, delta, rt); } static void __cpuinit tsc_sync_wait(void) { if (notscsync || !cpu_has_tsc) return; - printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(), - boot_cpu_id); - sync_tsc(); + sync_tsc(boot_cpu_id); } static __init int notscsync_setup(char *s) @@ -537,7 +522,7 @@ void __cpuinit start_secondary(void) extern volatile unsigned long init_rsp; extern void (*initial_code)(void); -#if APIC_DEBUG +#ifdef APIC_DEBUG static void inquire_remote_apic(int apicid) { unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; @@ -773,8 +758,9 @@ do_rest: initial_code = start_secondary; clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK); - printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, - start_rip, init_rsp); + printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu, + cpus_weight(cpu_present_map), + apicid); /* * This grunge runs the startup process for @@ -841,7 +827,7 @@ do_rest: else /* trampoline code not run */ printk("Not responding.\n"); -#if APIC_DEBUG +#ifdef APIC_DEBUG inquire_remote_apic(apicid); #endif } @@ -924,6 +910,27 @@ static __init void enforce_max_cpus(unsigned max_cpus) } } 
+#ifdef CONFIG_HOTPLUG_CPU +/* + * cpu_possible_map should be static, it cannot change as cpu's + * are onlined, or offlined. The reason is per-cpu data-structures + * are allocated by some modules at init time, and dont expect to + * do this dynamically on cpu arrival/departure. + * cpu_present_map on the other hand can change dynamically. + * In case when cpu_hotplug is not compiled, then we resort to current + * behaviour, which is cpu_possible == cpu_present. + * If cpu-hotplug is supported, then we need to preallocate for all + * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range. + * - Ashok Raj + */ +static void prefill_possible_map(void) +{ + int i; + for (i = 0; i < NR_CPUS; i++) + cpu_set(i, cpu_possible_map); +} +#endif + /* * Various sanity checks. */ @@ -987,25 +994,15 @@ static int __init smp_sanity_check(unsigned max_cpus) */ void __init smp_prepare_cpus(unsigned int max_cpus) { - int i; - nmi_watchdog_default(); current_cpu_data = boot_cpu_data; current_thread_info()->cpu = 0; /* needed? */ enforce_max_cpus(max_cpus); - /* - * Fill in cpu_present_mask - */ - for (i = 0; i < NR_CPUS; i++) { - int apicid = cpu_present_to_apicid(i); - if (physid_isset(apicid, phys_cpu_present_map)) { - cpu_set(i, cpu_present_map); - cpu_set(i, cpu_possible_map); - } - fixup_cpu_possible_map(i); - } +#ifdef CONFIG_HOTPLUG_CPU + prefill_possible_map(); +#endif if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); @@ -1189,8 +1186,7 @@ void __cpu_die(unsigned int cpu) printk ("CPU %d is now offline\n", cpu); return; } - current->state = TASK_UNINTERRUPTIBLE; - schedule_timeout(HZ/10); + msleep(100); } printk(KERN_ERR "CPU %u didn't die...\n", cpu); } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 10273663000..6ead433a388 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -594,9 +594,6 @@ asmlinkage void default_do_nmi(struct pt_regs *regs) if (!cpu) reason = get_nmi_reason(); - if (!cpu_online(cpu)) - return; - if (!(reason & 0xc0)) { if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) == NOTIFY_STOP) |
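
The head.S hunk above replaces a chain of seven btsq instructions with a single movl of a composed constant. The arithmetic below is not in the patch; it just spells out the value the new code loads:

/* CR0_PM | CR0_MP | CR0_ET | CR0_NE | CR0_WP  | CR0_AM  | CR0_PAGING
 *  0x1   |  0x2   |  0x10  |  0x20  | 0x10000 | 0x40000 | 0x80000000
 * = 0x80050033
 * i.e. the movl sets PE, MP, ET, NE, WP, AM and PG in one go, and the
 * zero-extension of the 32-bit move clears the upper bits that the old
 * xorq %rax,%rax used to clear. */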
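
The machine_kexec.c hunks drop the file's private LEVELn_SIZE and L*_ATTR macros in favour of the generic 4-level page-table types (pgd_t/pud_t/pmd_t) and their set_*/clear helpers. The correspondence below is implied by the substitutions in the patch and is listed only for reference (x86-64, 4KB pages, 4-level paging):

/* removed macro               generic constant            value
 * LEVEL0_SIZE (1UL << 12)  -> PAGE_SIZE                    4 KB
 * LEVEL1_SIZE (1UL << 21)  -> PMD_SIZE                     2 MB
 * LEVEL2_SIZE (1UL << 30)  -> PUD_SIZE                     1 GB
 * LEVEL3_SIZE (1UL << 39)  -> PGDIR_SIZE                   512 GB
 * LEVEL4_SIZE (1UL << 48)  -> PTRS_PER_PGD * PGDIR_SIZE    256 TB
 *
 * L1_ATTR (present|rw|accessed|dirty|PSE) -> __PAGE_KERNEL_LARGE_EXEC
 * L2_ATTR / L3_ATTR (present|rw|accessed|dirty) -> _KERNPG_TABLE */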
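
The mce.c change replaces the single global machinecheck sysdev with one sysdev per CPU and keeps that set in sync with CPU hotplug through a notifier. The skeleton below restates that control flow in isolation; the foo_* names are placeholders, and the per-bank attribute files handled by the real code are elided:

/* Pattern used by mce.c: one sys_device per CPU, created for every CPU
 * already online at init time, then added/removed by a hotplug notifier
 * as CPUs come and go.  (Placeholder names; attribute files omitted.) */
static struct sysdev_class foo_sysclass = {
	set_kset_name("foo"),
};
static DEFINE_PER_CPU(struct sys_device, device_foo);

static int foo_create_device(unsigned int cpu)
{
	per_cpu(device_foo, cpu).id = cpu;
	per_cpu(device_foo, cpu).cls = &foo_sysclass;
	return sysdev_register(&per_cpu(device_foo, cpu));
}

static void foo_remove_device(unsigned int cpu)
{
	sysdev_unregister(&per_cpu(device_foo, cpu));
}

static int foo_cpu_callback(struct notifier_block *nfb,
			    unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		foo_create_device(cpu);
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
		foo_remove_device(cpu);
		break;
#endif
	}
	return NOTIFY_OK;
}

static struct notifier_block foo_cpu_notifier = {
	.notifier_call = foo_cpu_callback,
};

static int __init foo_init(void)
{
	int cpu, err;

	err = sysdev_class_register(&foo_sysclass);
	if (err)
		return err;
	for_each_online_cpu(cpu)
		foo_create_device(cpu);
	register_cpu_notifier(&foo_cpu_notifier);
	return 0;
}
device_initcall(foo_init);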
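
The smp.c hunk adds smp_call_function_single(), which directs a call-function IPI at exactly one CPU instead of broadcasting it (the TSC sync code in smpboot.c is its first user). Below is a minimal usage sketch, not taken from the patch; read_remote_tsc() and tsc_on_cpu() are hypothetical names. The callback runs in interrupt context on the target CPU, so it must be fast and must not sleep, and the call returns -EBUSY if the target is the calling CPU:

/* Illustrative only: read another CPU's TSC via the new helper. */
static void read_remote_tsc(void *info)
{
	u64 t;

	rdtscll(t);		/* executes on the target CPU */
	*(u64 *)info = t;
}

static u64 tsc_on_cpu(int cpu)
{
	u64 val = 0;

	/* nonatomic = 0, wait = 1: block until the target CPU has run
	 * the callback; fails with -EBUSY if cpu is the calling CPU. */
	if (smp_call_function_single(cpu, read_remote_tsc, &val, 0, 1))
		return 0;
	return val;
}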