diff options
Diffstat (limited to 'arch/x86')
30 files changed, 1242 insertions, 313 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 469f3450bf8..31758378bcd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -138,6 +138,9 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y +config HAVE_DYNAMIC_PER_CPU_AREA + def_bool y + config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP @@ -780,6 +783,11 @@ config X86_MCE_AMD Additional support for AMD specific MCE features such as the DRAM Error Threshold. +config X86_MCE_THRESHOLD + depends on X86_MCE_AMD || X86_MCE_INTEL + bool + default y + config X86_MCE_NONFATAL tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" depends on X86_32 && X86_MCE @@ -1125,7 +1133,7 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accomodate various tables. -config HAVE_ARCH_BOOTMEM_NODE +config HAVE_ARCH_BOOTMEM def_bool y depends on X86_32 && NUMA diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 63134e31e8b..bc9514fb3b1 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -53,6 +53,7 @@ #define APIC_ESR_SENDILL 0x00020 #define APIC_ESR_RECVILL 0x00040 #define APIC_ESR_ILLREGA 0x00080 +#define APIC_LVTCMCI 0x2f0 #define APIC_ICR 0x300 #define APIC_DEST_SELF 0x40000 #define APIC_DEST_ALLINC 0x80000 diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 2f8466540fb..5b301b7ff5f 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -5,24 +5,43 @@ #include <linux/mm.h> /* Caches aren't brain-dead on the intel. */ -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define flush_dcache_page(page) do { } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_icache_range(start, end) do { } while (0) -#define flush_icache_page(vma, pg) do { } while (0) -#define flush_icache_user_range(vma, pg, adr, len) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) +static inline void flush_cache_all(void) { } +static inline void flush_cache_mm(struct mm_struct *mm) { } +static inline void flush_cache_dup_mm(struct mm_struct *mm) { } +static inline void flush_cache_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) { } +static inline void flush_cache_page(struct vm_area_struct *vma, + unsigned long vmaddr, unsigned long pfn) { } +static inline void flush_dcache_page(struct page *page) { } +static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } +static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { } +static inline void flush_icache_range(unsigned long start, + unsigned long end) { } +static inline void flush_icache_page(struct vm_area_struct *vma, + struct page *page) { } +static inline void flush_icache_user_range(struct vm_area_struct *vma, + struct page *page, + unsigned long addr, + unsigned long len) { } +static inline void flush_cache_vmap(unsigned long start, unsigned long end) { } +static inline void flush_cache_vunmap(unsigned long start, + unsigned long end) { } -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ - memcpy((dst), (src), (len)) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy((dst), (src), (len)) +static inline void copy_to_user_page(struct vm_area_struct *vma, + struct page *page, unsigned long vaddr, + void *dst, const void *src, + unsigned long len) +{ + memcpy(dst, src, len); +} + +static inline void copy_from_user_page(struct vm_area_struct *vma, + struct page *page, unsigned long vaddr, + void *dst, const void *src, + unsigned long len) +{ + memcpy(dst, src, len); +} #define PG_non_WB PG_arch_1 PAGEFLAG(NonWB, non_WB) diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 48f0004db8c..71c9e518398 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -172,7 +172,13 @@ static inline void __save_init_fpu(struct task_struct *tsk) #else /* CONFIG_X86_32 */ -extern void finit(void); +#ifdef CONFIG_MATH_EMULATION +extern void finit_task(struct task_struct *tsk); +#else +static inline void finit_task(struct task_struct *tsk) +{ +} +#endif static inline void tolerant_fwait(void) { diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 683d0b4c00f..e5383e3d2f8 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -172,8 +172,6 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) extern void iounmap(volatile void __iomem *addr); -extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); - #ifdef CONFIG_X86_32 # include "io_32.h" @@ -198,7 +196,6 @@ extern void early_ioremap_reset(void); extern void __iomem *early_ioremap(unsigned long offset, unsigned long size); extern void __iomem *early_memremap(unsigned long offset, unsigned long size); extern void early_iounmap(void __iomem *addr, unsigned long size); -extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); #define IO_SPACE_LIMIT 0xffff diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 32c6e17b960..563933e06a3 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -11,6 +11,8 @@ */ #define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ +#define MCG_EXT_P (1ULL<<9) /* Extended registers available */ +#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ #define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ #define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ @@ -90,14 +92,29 @@ extern int mce_disabled; #include <asm/atomic.h> +void mce_setup(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, device_mce); extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); +/* + * To support more than 128 would need to escape the predefined + * Linux defined extended banks first. + */ +#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) + #ifdef CONFIG_X86_MCE_INTEL void mce_intel_feature_init(struct cpuinfo_x86 *c); +void cmci_clear(void); +void cmci_reenable(void); +void cmci_rediscover(int dying); +void cmci_recheck(void); #else static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } +static inline void cmci_clear(void) {} +static inline void cmci_reenable(void) {} +static inline void cmci_rediscover(int dying) {} +static inline void cmci_recheck(void) {} #endif #ifdef CONFIG_X86_MCE_AMD @@ -106,11 +123,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c); static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } #endif -void mce_log_therm_throt_event(unsigned int cpu, __u64 status); +extern int mce_available(struct cpuinfo_x86 *c); + +void mce_log_therm_throt_event(__u64 status); extern atomic_t mce_entry; extern void do_machine_check(struct pt_regs *, long); + +typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); +DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); + +enum mcp_flags { + MCP_TIMESTAMP = (1 << 0), /* log time stamp */ + MCP_UC = (1 << 1), /* log uncorrected errors */ +}; +extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); + extern int mce_notify_user(void); #endif /* !CONFIG_X86_32 */ @@ -120,8 +149,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c); #else #define mcheck_init(c) do { } while (0) #endif -extern void stop_mce(void); -extern void restart_mce(void); + +extern void (*mce_threshold_vector)(void); #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 105fb90a063..ede6998bd92 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -91,46 +91,9 @@ static inline int pfn_valid(int pfn) #endif /* CONFIG_DISCONTIGMEM */ #ifdef CONFIG_NEED_MULTIPLE_NODES - -/* - * Following are macros that are specific to this numa platform. - */ -#define reserve_bootmem(addr, size, flags) \ - reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags)) -#define alloc_bootmem(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_nopanic(x) \ - __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ - __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0) -#define alloc_bootmem_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_pages_nopanic(x) \ - __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \ - __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) -#define alloc_bootmem_node(pgdat, x) \ -({ \ - struct pglist_data __maybe_unused \ - *__alloc_bootmem_node__pgdat = (pgdat); \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ - __pa(MAX_DMA_ADDRESS)); \ -}) -#define alloc_bootmem_pages_node(pgdat, x) \ -({ \ - struct pglist_data __maybe_unused \ - *__alloc_bootmem_node__pgdat = (pgdat); \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \ - __pa(MAX_DMA_ADDRESS)); \ -}) -#define alloc_bootmem_low_pages_node(pgdat, x) \ -({ \ - struct pglist_data __maybe_unused \ - *__alloc_bootmem_node__pgdat = (pgdat); \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \ -}) +/* always use node 0 for bootmem on this numa platform */ +#define bootmem_arch_preferred_node(__bdata, size, align, goal, limit) \ + (NODE_DATA(0)->bdata) #endif /* CONFIG_NEED_MULTIPLE_NODES */ #endif /* _ASM_X86_MMZONE_32_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 358acc59ae0..2dbd2314139 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -77,6 +77,11 @@ #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 +/* These are consecutive and not in the normal 4er MCE bank block */ +#define MSR_IA32_MC0_CTL2 0x00000280 +#define CMCI_EN (1ULL << 30) +#define CMCI_THRESHOLD_MASK 0xffffULL + #define MSR_P6_PERFCTR0 0x000000c1 #define MSR_P6_PERFCTR1 0x000000c2 #define MSR_P6_EVNTSEL0 0x00000186 diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index aee103b26d0..8f1d2fbec1d 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -43,6 +43,14 @@ #else /* ...!ASSEMBLY */ #include <linux/stringify.h> +#include <asm/sections.h> + +#define __addr_to_pcpu_ptr(addr) \ + (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ + + (unsigned long)__per_cpu_start) +#define __pcpu_ptr_to_addr(ptr) \ + (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ + - (unsigned long)__per_cpu_start) #ifdef CONFIG_SMP #define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1c097a3a666..d0812e155f1 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -288,6 +288,8 @@ static inline int is_new_memtype_allowed(unsigned long flags, return 1; } +pmd_t *populate_extra_pmd(unsigned long vaddr); +pte_t *populate_extra_pte(unsigned long vaddr); #endif /* __ASSEMBLY__ */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 4bd990ee43d..1a918dde46b 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -164,6 +164,7 @@ static inline pte_t __pte_ma(pteval_t x) xmaddr_t arbitrary_virt_to_machine(void *address); +unsigned long arbitrary_virt_to_mfn(void *vaddr); void make_lowmem_page_readonly(void *vaddr); void make_lowmem_page_readwrite(void *vaddr); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 6907b8e85d5..4c80f155743 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -414,9 +414,17 @@ void __init alternative_instructions(void) that might execute the to be patched code. Other CPUs are not running. */ stop_nmi(); -#ifdef CONFIG_X86_MCE - stop_mce(); -#endif + + /* + * Don't stop machine check exceptions while patching. + * MCEs only happen when something got corrupted and in this + * case we must do something about the corruption. + * Ignoring it is worse than a unlikely patching race. + * Also machine checks tend to be broadcast and if one CPU + * goes into machine check the others follow quickly, so we don't + * expect a machine check to cause undue problems during to code + * patching. + */ apply_alternatives(__alt_instructions, __alt_instructions_end); @@ -456,9 +464,6 @@ void __init alternative_instructions(void) (unsigned long)__smp_locks_end); restart_nmi(); -#ifdef CONFIG_X86_MCE - restart_mce(); -#endif } /** diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f9cecdfd05c..30909a258d0 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -46,6 +46,7 @@ #include <asm/idle.h> #include <asm/mtrr.h> #include <asm/smp.h> +#include <asm/mce.h> unsigned int num_processors; @@ -842,6 +843,14 @@ void clear_local_APIC(void) apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); } #endif +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 6) { + v = apic_read(APIC_LVTCMCI); + if (!(v & APIC_LVT_MASKED)) + apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED); + } +#endif + /* * Clean APIC state for other OSs: */ @@ -1241,6 +1250,12 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_LVT1, value); preempt_enable(); + +#ifdef CONFIG_X86_MCE_INTEL + /* Recheck CMCI information after local APIC is up on CPU #0 */ + if (smp_processor_id() == 0) + cmci_recheck(); +#endif } void __cpuinit end_local_APIC_setup(void) diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 4b1c319d30c..22590cf688a 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (!data) return -ENOMEM; - data->acpi_data = percpu_ptr(acpi_perf_data, cpu); + data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); per_cpu(drv_data, cpu) = data; if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index d7d2323bbb6..b2f89829bbe 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o +obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c index dfaebce3633..3552119b091 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ b/arch/x86/kernel/cpu/mcheck/mce_32.c @@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c) } } -static unsigned long old_cr4 __initdata; - -void __init stop_mce(void) -{ - old_cr4 = read_cr4(); - clear_in_cr4(X86_CR4_MCE); -} - -void __init restart_mce(void) -{ - if (old_cr4 & X86_CR4_MCE) - set_in_cr4(X86_CR4_MCE); -} - static int __init mcheck_disable(char *str) { mce_disabled = 1; diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index fe79985ce0f..bfbd5323a63 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -3,6 +3,8 @@ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. * Rest from unknown author(s). * 2004 Andi Kleen. Rewrote most of it. + * Copyright 2008 Intel Corporation + * Author: Andi Kleen */ #include <linux/init.h> @@ -24,6 +26,9 @@ #include <linux/ctype.h> #include <linux/kmod.h> #include <linux/kdebug.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> +#include <linux/ratelimit.h> #include <asm/processor.h> #include <asm/msr.h> #include <asm/mce.h> @@ -32,7 +37,6 @@ #include <asm/idle.h> #define MISC_MCELOG_MINOR 227 -#define NR_SYSFS_BANKS 6 atomic_t mce_entry; @@ -47,7 +51,7 @@ static int mce_dont_init; */ static int tolerant = 1; static int banks; -static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; +static u64 *bank; static unsigned long notify_user; static int rip_msr; static int mce_bootlog = -1; @@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL }; static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +/* MCA banks polled by the period polling timer for corrected events */ +DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { + [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL +}; + +/* Do initial initialization of a struct mce */ +void mce_setup(struct mce *m) +{ + memset(m, 0, sizeof(struct mce)); + m->cpu = smp_processor_id(); + rdtscll(m->tsc); +} + /* * Lockless MCE logging infrastructure. * This avoids deadlocks on printk locks without having to break locks. Also @@ -119,11 +136,11 @@ static void print_mce(struct mce *m) print_symbol("{%s}", m->ip); printk("\n"); } - printk(KERN_EMERG "TSC %Lx ", m->tsc); + printk(KERN_EMERG "TSC %llx ", m->tsc); if (m->addr) - printk("ADDR %Lx ", m->addr); + printk("ADDR %llx ", m->addr); if (m->misc) - printk("MISC %Lx ", m->misc); + printk("MISC %llx ", m->misc); printk("\n"); printk(KERN_EMERG "This is not a software problem!\n"); printk(KERN_EMERG "Run through mcelog --ascii to decode " @@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) panic(msg); } -static int mce_available(struct cpuinfo_x86 *c) +int mce_available(struct cpuinfo_x86 *c) { + if (mce_dont_init) + return 0; return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } @@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } /* - * The actual machine check handler + * Poll for corrected events or events that happened before reset. + * Those are just logged through /dev/mcelog. + * + * This is executed in standard interrupt context. + */ +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) +{ + struct mce m; + int i; + + mce_setup(&m); + + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + for (i = 0; i < banks; i++) { + if (!bank[i] || !test_bit(i, *b)) + continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; + m.tsc = 0; + + barrier(); + rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + if (!(m.status & MCI_STATUS_VAL)) + continue; + + /* + * Uncorrected events are handled by the exception handler + * when it is enabled. But when the exception is disabled log + * everything. + * + * TBD do the same check for MCI_STATUS_EN here? + */ + if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) + continue; + + if (m.status & MCI_STATUS_MISCV) + rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + + if (!(flags & MCP_TIMESTAMP)) + m.tsc = 0; + /* + * Don't get the IP here because it's unlikely to + * have anything to do with the actual error location. + */ + + mce_log(&m); + add_taint(TAINT_MACHINE_CHECK); + + /* + * Clear state for this bank. + */ + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } + + /* + * Don't clear MCG_STATUS here because it's only defined for + * exceptions. + */ +} + +/* + * The actual machine check handler. This only handles real + * exceptions when something got corrupted coming in through int 18. + * + * This is executed in NMI context not subject to normal locking rules. This + * implies that most kernel services cannot be safely used. Don't even + * think about putting a printk in there! */ void do_machine_check(struct pt_regs * regs, long error_code) { @@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code) * error. */ int kill_it = 0; + DECLARE_BITMAP(toclear, MAX_NR_BANKS); atomic_inc(&mce_entry); - if ((regs - && notify_die(DIE_NMI, "machine check", regs, error_code, + if (notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL) == NOTIFY_STOP) - || !banks) + goto out2; + if (!banks) goto out2; - memset(&m, 0, sizeof(struct mce)); - m.cpu = smp_processor_id(); + mce_setup(&m); + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); /* if the restart IP is not valid, we're done for */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) @@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code) barrier(); for (i = 0; i < banks; i++) { - if (i < NR_SYSFS_BANKS && !bank[i]) + __clear_bit(i, toclear); + if (!bank[i]) continue; m.misc = 0; m.addr = 0; m.bank = i; - m.tsc = 0; rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); if ((m.status & MCI_STATUS_VAL) == 0) continue; + /* + * Non uncorrected errors are handled by machine_check_poll + * Leave them alone. + */ + if ((m.status & MCI_STATUS_UC) == 0) + continue; + + /* + * Set taint even when machine check was not enabled. + */ + add_taint(TAINT_MACHINE_CHECK); + + __set_bit(i, toclear); + if (m.status & MCI_STATUS_EN) { /* if PCC was set, there's no way out */ no_way_out |= !!(m.status & MCI_STATUS_PCC); @@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code) no_way_out = 1; kill_it = 1; } + } else { + /* + * Machine check event was not enabled. Clear, but + * ignore. + */ + continue; } if (m.status & MCI_STATUS_MISCV) @@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); mce_get_rip(&m, regs); - if (error_code >= 0) - rdtscll(m.tsc); - if (error_code != -2) - mce_log(&m); + mce_log(&m); /* Did this bank cause the exception? */ /* Assume that the bank with uncorrectable errors did it, @@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) panicm = m; panicm_found = 1; } - - add_taint(TAINT_MACHINE_CHECK); } - /* Never do anything final in the polling timer */ - if (!regs) - goto out; - /* If we didn't find an uncorrectable error, pick the last one (shouldn't happen, just being safe). */ if (!panicm_found) @@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code) /* notify userspace ASAP */ set_thread_flag(TIF_MCE_NOTIFY); - out: /* the last thing we do is clear state */ - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + for (i = 0; i < banks; i++) { + if (test_bit(i, toclear)) + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } wrmsrl(MSR_IA32_MCG_STATUS, 0); out2: atomic_dec(&mce_entry); @@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code) * and historically has been the register value of the * MSR_IA32_THERMAL_STATUS (Intel) msr. */ -void mce_log_therm_throt_event(unsigned int cpu, __u64 status) +void mce_log_therm_throt_event(__u64 status) { struct mce m; - memset(&m, 0, sizeof(m)); - m.cpu = cpu; + mce_setup(&m); m.bank = MCE_THERMAL_BANK; m.status = status; - rdtscll(m.tsc); mce_log(&m); } #endif /* CONFIG_X86_MCE_INTEL */ @@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status) static int check_interval = 5 * 60; /* 5 minutes */ static int next_interval; /* in jiffies */ -static void mcheck_timer(struct work_struct *work); -static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); +static void mcheck_timer(unsigned long); +static DEFINE_PER_CPU(struct timer_list, mce_timer); -static void mcheck_check_cpu(void *info) +static void mcheck_timer(unsigned long data) { - if (mce_available(¤t_cpu_data)) - do_machine_check(NULL, 0); -} + struct timer_list *t = &per_cpu(mce_timer, data); -static void mcheck_timer(struct work_struct *work) -{ - on_each_cpu(mcheck_check_cpu, NULL, 1); + WARN_ON(smp_processor_id() != data); + + if (mce_available(¤t_cpu_data)) + machine_check_poll(MCP_TIMESTAMP, + &__get_cpu_var(mce_poll_banks)); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work) (int)round_jiffies_relative(check_interval*HZ)); } - schedule_delayed_work(&mcheck_work, next_interval); + t->expires = jiffies + next_interval; + add_timer(t); +} + +static void mce_do_trigger(struct work_struct *work) +{ + call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); } +static DECLARE_WORK(mce_trigger_work, mce_do_trigger); + /* - * This is only called from process context. This is where we do - * anything we need to alert userspace about new MCEs. This is called - * directly from the poller and also from entry.S and idle, thanks to - * TIF_MCE_NOTIFY. + * Notify the user(s) about new machine check events. + * Can be called from interrupt context, but not from machine check/NMI + * context. */ int mce_notify_user(void) { + /* Not more than two messages every minute */ + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + clear_thread_flag(TIF_MCE_NOTIFY); if (test_and_clear_bit(0, ¬ify_user)) { - static unsigned long last_print; - unsigned long now = jiffies; - wake_up_interruptible(&mce_wait); - if (trigger[0]) - call_usermodehelper(trigger, trigger_argv, NULL, - UMH_NO_WAIT); - if (time_after_eq(now, last_print + (check_interval*HZ))) { - last_print = now; + /* + * There is no risk of missing notifications because + * work_pending is always cleared before the function is + * executed. + */ + if (trigger[0] && !work_pending(&mce_trigger_work)) + schedule_work(&mce_trigger_work); + + if (__ratelimit(&ratelimit)) printk(KERN_INFO "Machine check events logged\n"); - } return 1; } @@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = { static __init int periodic_mcheck_init(void) { - next_interval = check_interval * HZ; - if (next_interval) - schedule_delayed_work(&mcheck_work, - round_jiffies_relative(next_interval)); - idle_notifier_register(&mce_idle_notifier); - return 0; + idle_notifier_register(&mce_idle_notifier); + return 0; } __initcall(periodic_mcheck_init); - /* * Initialize Machine Checks for a CPU. */ -static void mce_init(void *dummy) +static int mce_cap_init(void) { u64 cap; - int i; + unsigned b; rdmsrl(MSR_IA32_MCG_CAP, cap); - banks = cap & 0xff; - if (banks > MCE_EXTENDED_BANK) { - banks = MCE_EXTENDED_BANK; - printk(KERN_INFO "MCE: warning: using only %d banks\n", - MCE_EXTENDED_BANK); + b = cap & 0xff; + if (b > MAX_NR_BANKS) { + printk(KERN_WARNING + "MCE: Using only %u machine check banks out of %u\n", + MAX_NR_BANKS, b); + b = MAX_NR_BANKS; } + + /* Don't support asymmetric configurations today */ + WARN_ON(banks != 0 && b != banks); + banks = b; + if (!bank) { + bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); + if (!bank) + return -ENOMEM; + memset(bank, 0xff, banks * sizeof(u64)); + } + /* Use accurate RIP reporting if available. */ if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) rip_msr = MSR_IA32_MCG_EIP; - /* Log the machine checks left over from the previous reset. - This also clears all registers */ - do_machine_check(NULL, mce_bootlog ? -1 : -2); + return 0; +} + +static void mce_init(void *dummy) +{ + u64 cap; + int i; + mce_banks_t all_banks; + + /* + * Log the machine checks left over from the previous reset. + */ + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC, &all_banks); set_in_cr4(X86_CR4_MCE); + rdmsrl(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - if (i < NR_SYSFS_BANKS) - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); - else - wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); - + wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } } /* Add per CPU specific workarounds here */ -static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) +static void mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { - if(c->x86 == 15) + if (c->x86 == 15 && banks > 4) /* disable GART TBL walk error reporting, which trips off incorrectly with the IOMMU & 3ware & Cerberus. */ - clear_bit(10, &bank[4]); + clear_bit(10, (unsigned long *)&bank[4]); if(c->x86 <= 17 && mce_bootlog < 0) /* Lots of broken BIOS around that don't clear them by default and leave crap in there. Don't log. */ @@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) } } +static void mce_init_timer(void) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + + /* data race harmless because everyone sets to the same value */ + if (!next_interval) + next_interval = check_interval * HZ; + if (!next_interval) + return; + setup_timer(t, mcheck_timer, smp_processor_id()); + t->expires = round_jiffies_relative(jiffies + next_interval); + add_timer(t); +} + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off. */ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { - mce_cpu_quirks(c); + if (!mce_available(c)) + return; - if (mce_dont_init || - !mce_available(c)) + if (mce_cap_init() < 0) { + mce_dont_init = 1; return; + } + mce_cpu_quirks(c); mce_init(NULL); mce_cpu_features(c); + mce_init_timer(); } /* @@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, { unsigned long *cpu_tsc; static DEFINE_MUTEX(mce_read_mutex); - unsigned next; + unsigned prev, next; char __user *buf = ubuf; int i, err; @@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, } err = 0; - for (i = 0; i < next; i++) { - unsigned long start = jiffies; - - while (!mcelog.entry[i].finished) { - if (time_after_eq(jiffies, start + 2)) { - memset(mcelog.entry + i,0, sizeof(struct mce)); - goto timeout; + prev = 0; + do { + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + + while (!mcelog.entry[i].finished) { + if (time_after_eq(jiffies, start + 2)) { + memset(mcelog.entry + i, 0, + sizeof(struct mce)); + goto timeout; + } + cpu_relax(); } - cpu_relax(); + smp_rmb(); + err |= copy_to_user(buf, mcelog.entry + i, + sizeof(struct mce)); + buf += sizeof(struct mce); +timeout: + ; } - smp_rmb(); - err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); - buf += sizeof(struct mce); - timeout: - ; - } - memset(mcelog.entry, 0, next * sizeof(struct mce)); - mcelog.next = 0; + memset(mcelog.entry + prev, 0, + (next - prev) * sizeof(struct mce)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); synchronize_sched(); @@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = { &mce_chrdev_ops, }; -static unsigned long old_cr4 __initdata; - -void __init stop_mce(void) -{ - old_cr4 = read_cr4(); - clear_in_cr4(X86_CR4_MCE); -} - -void __init restart_mce(void) -{ - if (old_cr4 & X86_CR4_MCE) - set_in_cr4(X86_CR4_MCE); -} - /* * Old style boot options parsing. Only for compatibility. */ @@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str) return 1; } -/* mce=off disables machine check. Note you can re-enable it later - using sysfs. +/* mce=off disables machine check. mce=TOLERANCELEVEL (number, see above) mce=bootlog Log MCEs from before booting. Disabled by default on AMD. mce=nobootlog Don't log MCEs from before booting. */ @@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable); * Sysfs support */ +/* + * Disable machine checks on suspend and shutdown. We can't really handle + * them later. + */ +static int mce_disable(void) +{ + int i; + + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + return 0; +} + +static int mce_suspend(struct sys_device *dev, pm_message_t state) +{ + return mce_disable(); +} + +static int mce_shutdown(struct sys_device *dev) +{ + return mce_disable(); +} + /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. Only one CPU is active at this time, the others get readded later using CPU hotplug. */ @@ -738,20 +896,24 @@ static int mce_resume(struct sys_device *dev) return 0; } +static void mce_cpu_restart(void *data) +{ + del_timer_sync(&__get_cpu_var(mce_timer)); + if (mce_available(¤t_cpu_data)) + mce_init(NULL); + mce_init_timer(); +} + /* Reinit MCEs after user configuration changes */ static void mce_restart(void) { - if (next_interval) - cancel_delayed_work(&mcheck_work); - /* Timer race is harmless here */ - on_each_cpu(mce_init, NULL, 1); next_interval = check_interval * HZ; - if (next_interval) - schedule_delayed_work(&mcheck_work, - round_jiffies_relative(next_interval)); + on_each_cpu(mce_cpu_restart, NULL, 1); } static struct sysdev_class mce_sysclass = { + .suspend = mce_suspend, + .shutdown = mce_shutdown, .resume = mce_resume, .name = "machinecheck", }; @@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit } \ static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); -/* - * TBD should generate these dynamically based on number of available banks. - * Have only 6 contol banks in /sysfs until then. - */ -ACCESSOR(bank0ctl,bank[0],mce_restart()) -ACCESSOR(bank1ctl,bank[1],mce_restart()) -ACCESSOR(bank2ctl,bank[2],mce_restart()) -ACCESSOR(bank3ctl,bank[3],mce_restart()) -ACCESSOR(bank4ctl,bank[4],mce_restart()) -ACCESSOR(bank5ctl,bank[5],mce_restart()) +static struct sysdev_attribute *bank_attrs; + +static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, + char *buf) +{ + u64 b = bank[attr - bank_attrs]; + return sprintf(buf, "%llx\n", b); +} + +static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, + const char *buf, size_t siz) +{ + char *end; + u64 new = simple_strtoull(buf, &end, 0); + if (end == buf) + return -EINVAL; + bank[attr - bank_attrs] = new; + mce_restart(); + return end-buf; +} static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) @@ -814,8 +986,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); ACCESSOR(check_interval,check_interval,mce_restart()) static struct sysdev_attribute *mce_attributes[] = { - &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, - &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, &attr_tolerant.attr, &attr_check_interval, &attr_trigger, NULL }; @@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu) if (err) goto error; } + for (i = 0; i < banks; i++) { + err = sysdev_create_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); + if (err) + goto error2; + } cpu_set(cpu, mce_device_initialized); return 0; +error2: + while (--i >= 0) { + sysdev_remove_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); + } error: - while (i--) { + while (--i >= 0) { sysdev_remove_file(&per_cpu(device_mce,cpu), mce_attributes[i]); } @@ -868,15 +1049,46 @@ static __cpuinit void mce_remove_device(unsigned int cpu) for (i = 0; mce_attributes[i]; i++) sysdev_remove_file(&per_cpu(device_mce,cpu), mce_attributes[i]); + for (i = 0; i < banks; i++) + sysdev_remove_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); sysdev_unregister(&per_cpu(device_mce,cpu)); cpu_clear(cpu, mce_device_initialized); } +/* Make sure there are no machine checks on offlined CPUs. */ +static void mce_disable_cpu(void *h) +{ + int i; + unsigned long action = *(unsigned long *)h; + + if (!mce_available(¤t_cpu_data)) + return; + if (!(action & CPU_TASKS_FROZEN)) + cmci_clear(); + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); +} + +static void mce_reenable_cpu(void *h) +{ + int i; + unsigned long action = *(unsigned long *)h; + + if (!mce_available(¤t_cpu_data)) + return; + if (!(action & CPU_TASKS_FROZEN)) + cmci_reenable(); + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); +} + /* Get notified when a cpu comes on/off. Be hotplug friendly. */ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; + struct timer_list *t = &per_cpu(mce_timer, cpu); switch (action) { case CPU_ONLINE: @@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, threshold_cpu_callback(action, cpu); mce_remove_device(cpu); break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + del_timer_sync(t); + smp_call_function_single(cpu, mce_disable_cpu, &action, 1); + break; + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + t->expires = round_jiffies_relative(jiffies + next_interval); + add_timer_on(t, cpu); + smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); + break; + case CPU_POST_DEAD: + /* intentionally ignoring frozen here */ + cmci_rediscover(cpu); + break; } return NOTIFY_OK; } @@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = { .notifier_call = mce_cpu_callback, }; +static __init int mce_init_banks(void) +{ + int i; + + bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, + GFP_KERNEL); + if (!bank_attrs) + return -ENOMEM; + + for (i = 0; i < banks; i++) { + struct sysdev_attribute *a = &bank_attrs[i]; + a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); + if (!a->attr.name) + goto nomem; + a->attr.mode = 0644; + a->show = show_bank; + a->store = set_bank; + } + return 0; + +nomem: + while (--i >= 0) + kfree(bank_attrs[i].attr.name); + kfree(bank_attrs); + bank_attrs = NULL; + return -ENOMEM; +} + static __init int mce_init_device(void) { int err; @@ -906,6 +1161,11 @@ static __init int mce_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; + + err = mce_init_banks(); + if (err) + return err; + err = sysdev_class_register(&mce_sysclass); if (err) return err; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 9817506dd46..c5a32f92d07 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = { static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ +static void amd_threshold_interrupt(void); + /* * CPU Initialization */ @@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) tr.reset = 0; tr.old_limit = 0; threshold_restart_bank(&tr); + + mce_threshold_vector = amd_threshold_interrupt; } } } @@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) * the interrupt goes off when error_count reaches threshold_limit. * the handler will simply log mcelog w/ software defined bank number. */ -asmlinkage void mce_threshold_interrupt(void) +static void amd_threshold_interrupt(void) { unsigned int bank, block; struct mce m; u32 low = 0, high = 0, address = 0; - ack_APIC_irq(); - exit_idle(); - irq_enter(); - - memset(&m, 0, sizeof(m)); - rdtscll(m.tsc); - m.cpu = smp_processor_id(); + mce_setup(&m); /* assume first bank caused it */ for (bank = 0; bank < NR_BANKS; ++bank) { @@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void) /* Log the machine check that caused the threshold event. */ - do_machine_check(NULL, 0); + machine_check_poll(MCP_TIMESTAMP, + &__get_cpu_var(mce_poll_banks)); if (high & MASK_OVERFLOW_HI) { rdmsrl(address, m.misc); @@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void) + bank * NR_BLOCKS + block; mce_log(&m); - goto out; + return; } } } -out: - inc_irq_stat(irq_threshold_count); - irq_exit(); } /* diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index aa5e287c98e..aaa7d973093 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -1,6 +1,8 @@ /* * Intel specific MCE features. * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> + * Copyright (C) 2008, 2009 Intel Corporation + * Author: Andi Kleen */ #include <linux/init.h> @@ -13,6 +15,7 @@ #include <asm/hw_irq.h> #include <asm/idle.h> #include <asm/therm_throt.h> +#include <asm/apic.h> asmlinkage void smp_thermal_interrupt(void) { @@ -25,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void) rdmsrl(MSR_IA32_THERM_STATUS, msr_val); if (therm_throt_process(msr_val & 1)) - mce_log_therm_throt_event(smp_processor_id(), msr_val); + mce_log_therm_throt_event(msr_val); inc_irq_stat(irq_thermal_count); irq_exit(); @@ -85,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) return; } +/* + * Support for Intel Correct Machine Check Interrupts. This allows + * the CPU to raise an interrupt when a corrected machine check happened. + * Normally we pick those up using a regular polling timer. + * Also supports reliable discovery of shared banks. + */ + +static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); + +/* + * cmci_discover_lock protects against parallel discovery attempts + * which could race against each other. + */ +static DEFINE_SPINLOCK(cmci_discover_lock); + +#define CMCI_THRESHOLD 1 + +static int cmci_supported(int *banks) +{ + u64 cap; + + /* + * Vendor check is not strictly needed, but the initial + * initialization is vendor keyed and this + * makes sure none of the backdoors are entered otherwise. + */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + if (!cpu_has_apic || lapic_get_maxlvt() < 6) + return 0; + rdmsrl(MSR_IA32_MCG_CAP, cap); + *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); + return !!(cap & MCG_CMCI_P); +} + +/* + * The interrupt handler. This is called on every event. + * Just call the poller directly to log any events. + * This could in theory increase the threshold under high load, + * but doesn't for now. + */ +static void intel_threshold_interrupt(void) +{ + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); + mce_notify_user(); +} + +static void print_update(char *type, int *hdr, int num) +{ + if (*hdr == 0) + printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); + *hdr = 1; + printk(KERN_CONT " %s:%d", type, num); +} + +/* + * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks + * on this CPU. Use the algorithm recommended in the SDM to discover shared + * banks. + */ +static void cmci_discover(int banks, int boot) +{ + unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); + int hdr = 0; + int i; + + spin_lock(&cmci_discover_lock); + for (i = 0; i < banks; i++) { + u64 val; + + if (test_bit(i, owned)) + continue; + + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + + /* Already owned by someone else? */ + if (val & CMCI_EN) { + if (test_and_clear_bit(i, owned) || boot) + print_update("SHD", &hdr, i); + __clear_bit(i, __get_cpu_var(mce_poll_banks)); + continue; + } + + val |= CMCI_EN | CMCI_THRESHOLD; + wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + + /* Did the enable bit stick? -- the bank supports CMCI */ + if (val & CMCI_EN) { + if (!test_and_set_bit(i, owned) || boot) + print_update("CMCI", &hdr, i); + __clear_bit(i, __get_cpu_var(mce_poll_banks)); + } else { + WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); + } + } + spin_unlock(&cmci_discover_lock); + if (hdr) + printk(KERN_CONT "\n"); +} + +/* + * Just in case we missed an event during initialization check + * all the CMCI owned banks. + */ +void cmci_recheck(void) +{ + unsigned long flags; + int banks; + + if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) + return; + local_irq_save(flags); + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); + local_irq_restore(flags); +} + +/* + * Disable CMCI on this CPU for all banks it owns when it goes down. + * This allows other CPUs to claim the banks on rediscovery. + */ +void cmci_clear(void) +{ + int i; + int banks; + u64 val; + + if (!cmci_supported(&banks)) + return; + spin_lock(&cmci_discover_lock); + for (i = 0; i < banks; i++) { + if (!test_bit(i, __get_cpu_var(mce_banks_owned))) + continue; + /* Disable CMCI */ + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); + wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + __clear_bit(i, __get_cpu_var(mce_banks_owned)); + } + spin_unlock(&cmci_discover_lock); +} + +/* + * After a CPU went down cycle through all the others and rediscover + * Must run in process context. + */ +void cmci_rediscover(int dying) +{ + int banks; + int cpu; + cpumask_var_t old; + + if (!cmci_supported(&banks)) + return; + if (!alloc_cpumask_var(&old, GFP_KERNEL)) + return; + cpumask_copy(old, ¤t->cpus_allowed); + + for_each_online_cpu (cpu) { + if (cpu == dying) + continue; + if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu))) + continue; + /* Recheck banks in case CPUs don't all have the same */ + if (cmci_supported(&banks)) + cmci_discover(banks, 0); + } + + set_cpus_allowed_ptr(current, old); + free_cpumask_var(old); +} + +/* + * Reenable CMCI on this CPU in case a CPU down failed. + */ +void cmci_reenable(void) +{ + int banks; + if (cmci_supported(&banks)) + cmci_discover(banks, 0); +} + +static __cpuinit void intel_init_cmci(void) +{ + int banks; + + if (!cmci_supported(&banks)) + return; + + mce_threshold_vector = intel_threshold_interrupt; + cmci_discover(banks, 1); + /* + * For CPU #0 this runs with still disabled APIC, but that's + * ok because only the vector is set up. We still do another + * check for the banks later for CPU #0 just to make sure + * to not miss any events. + */ + apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); + cmci_recheck(); +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); + intel_init_cmci(); } diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c new file mode 100644 index 00000000000..23ee9e730f7 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -0,0 +1,29 @@ +/* + * Common corrected MCE threshold handler code: + */ +#include <linux/interrupt.h> +#include <linux/kernel.h> + +#include <asm/irq_vectors.h> +#include <asm/apic.h> +#include <asm/idle.h> +#include <asm/mce.h> + +static void default_threshold_interrupt(void) +{ + printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n", + THRESHOLD_APIC_VECTOR); +} + +void (*mce_threshold_vector)(void) = default_threshold_interrupt; + +asmlinkage void mce_threshold_interrupt(void) +{ + exit_idle(); + irq_enter(); + inc_irq_stat(irq_threshold_count); + mce_threshold_vector(); + irq_exit(); + /* Ack only at the end to avoid potential reentry */ + ack_APIC_irq(); +} diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index b0f61f0dcd0..f2f8540a7f3 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -136,7 +136,7 @@ int init_fpu(struct task_struct *tsk) #ifdef CONFIG_X86_32 if (!HAVE_HWFP) { memset(tsk->thread.xstate, 0, xstate_size); - finit(); + finit_task(tsk); set_stopped_child_used_math(tsk); return 0; } diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 9dc6b2b2427..3b09634a515 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -16,6 +16,7 @@ #include <linux/cpu.h> #include <linux/delay.h> #include <linux/uaccess.h> +#include <linux/percpu.h> #include <asm/apic.h> @@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { } union irq_ctx { struct thread_info tinfo; u32 stack[THREAD_SIZE/sizeof(u32)]; -}; +} __attribute__((aligned(PAGE_SIZE))); -static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; -static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; +static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); +static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); -static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; -static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; +static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack); +static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack); static void call_on_stack(void *func, void *stack) { @@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) u32 *isp, arg1, arg2; curctx = (union irq_ctx *) current_thread_info(); - irqctx = hardirq_ctx[smp_processor_id()]; + irqctx = __get_cpu_var(hardirq_ctx); /* * this is where we switch to the IRQ stack. However, if we are @@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu) { union irq_ctx *irqctx; - if (hardirq_ctx[cpu]) + if (per_cpu(hardirq_ctx, cpu)) return; - irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; + irqctx = &per_cpu(hardirq_stack, cpu); irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); - hardirq_ctx[cpu] = irqctx; + per_cpu(hardirq_ctx, cpu) = irqctx; - irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE]; + irqctx = &per_cpu(softirq_stack, cpu); irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; irqctx->tinfo.preempt_count = 0; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); - softirq_ctx[cpu] = irqctx; + per_cpu(softirq_ctx, cpu) = irqctx; printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", - cpu, hardirq_ctx[cpu], softirq_ctx[cpu]); + cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); } void irq_ctx_exit(int cpu) { - hardirq_ctx[cpu] = NULL; + per_cpu(hardirq_ctx, cpu) = NULL; } asmlinkage void do_softirq(void) @@ -169,7 +170,7 @@ asmlinkage void do_softirq(void) if (local_softirq_pending()) { curctx = current_thread_info(); - irqctx = softirq_ctx[smp_processor_id()]; + irqctx = __get_cpu_var(softirq_ctx); irqctx->tinfo.task = curctx->task; irqctx->tinfo.previous_esp = current_stack_pointer; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 1cc18d439bb..2aef36d8aca 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -216,6 +216,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), }, }, + { /* Handle problems with rebooting on Dell XPS710 */ + .callback = set_bios_reboot, + .ident = "Dell XPS710", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"), + }, + }, { } }; diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index d992e6cff73..c29f301d388 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -7,6 +7,7 @@ #include <linux/crash_dump.h> #include <linux/smp.h> #include <linux/topology.h> +#include <linux/pfn.h> #include <asm/sections.h> #include <asm/processor.h> #include <asm/setup.h> @@ -41,6 +42,321 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +/** + * pcpu_need_numa - determine percpu allocation needs to consider NUMA + * + * If NUMA is not configured or there is only one NUMA node available, + * there is no reason to consider NUMA. This function determines + * whether percpu allocation should consider NUMA or not. + * + * RETURNS: + * true if NUMA should be considered; otherwise, false. + */ +static bool __init pcpu_need_numa(void) +{ +#ifdef CONFIG_NEED_MULTIPLE_NODES + pg_data_t *last = NULL; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + int node = early_cpu_to_node(cpu); + + if (node_online(node) && NODE_DATA(node) && + last && last != NODE_DATA(node)) + return true; + + last = NODE_DATA(node); + } +#endif + return false; +} + +/** + * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu + * @cpu: cpu to allocate for + * @size: size allocation in bytes + * @align: alignment + * + * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper + * does the right thing for NUMA regardless of the current + * configuration. + * + * RETURNS: + * Pointer to the allocated area on success, NULL on failure. + */ +static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, + unsigned long align) +{ + const unsigned long goal = __pa(MAX_DMA_ADDRESS); +#ifdef CONFIG_NEED_MULTIPLE_NODES + int node = early_cpu_to_node(cpu); + void *ptr; + + if (!node_online(node) || !NODE_DATA(node)) { + ptr = __alloc_bootmem_nopanic(size, align, goal); + pr_info("cpu %d has no node %d or node-local memory\n", + cpu, node); + pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", + cpu, size, __pa(ptr)); + } else { + ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), + size, align, goal); + pr_debug("per cpu data for cpu%d %lu bytes on node%d at " + "%016lx\n", cpu, size, node, __pa(ptr)); + } + return ptr; +#else + return __alloc_bootmem_nopanic(size, align, goal); +#endif +} + +/* + * Remap allocator + * + * This allocator uses PMD page as unit. A PMD page is allocated for + * each cpu and each is remapped into vmalloc area using PMD mapping. + * As PMD page is quite large, only part of it is used for the first + * chunk. Unused part is returned to the bootmem allocator. + * + * So, the PMD pages are mapped twice - once to the physical mapping + * and to the vmalloc area for the first percpu chunk. The double + * mapping does add one more PMD TLB entry pressure but still is much + * better than only using 4k mappings while still being NUMA friendly. + */ +#ifdef CONFIG_NEED_MULTIPLE_NODES +static size_t pcpur_size __initdata; +static void **pcpur_ptrs __initdata; + +static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) +{ + size_t off = (size_t)pageno << PAGE_SHIFT; + + if (off >= pcpur_size) + return NULL; + + return virt_to_page(pcpur_ptrs[cpu] + off); +} + +static ssize_t __init setup_pcpu_remap(size_t static_size) +{ + static struct vm_struct vm; + pg_data_t *last; + size_t ptrs_size; + unsigned int cpu; + ssize_t ret; + + /* + * If large page isn't supported, there's no benefit in doing + * this. Also, on non-NUMA, embedding is better. + */ + if (!cpu_has_pse || pcpu_need_numa()) + return -EINVAL; + + last = NULL; + for_each_possible_cpu(cpu) { + int node = early_cpu_to_node(cpu); + + if (node_online(node) && NODE_DATA(node) && + last && last != NODE_DATA(node)) + goto proceed; + + last = NODE_DATA(node); + } + return -EINVAL; + +proceed: + /* + * Currently supports only single page. Supporting multiple + * pages won't be too difficult if it ever becomes necessary. + */ + pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); + if (pcpur_size > PMD_SIZE) { + pr_warning("PERCPU: static data is larger than large page, " + "can't use large page\n"); + return -EINVAL; + } + + /* allocate pointer array and alloc large pages */ + ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); + pcpur_ptrs = alloc_bootmem(ptrs_size); + + for_each_possible_cpu(cpu) { + pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); + if (!pcpur_ptrs[cpu]) + goto enomem; + + /* + * Only use pcpur_size bytes and give back the rest. + * + * Ingo: The 2MB up-rounding bootmem is needed to make + * sure the partial 2MB page is still fully RAM - it's + * not well-specified to have a PAT-incompatible area + * (unmapped RAM, device memory, etc.) in that hole. + */ + free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), + PMD_SIZE - pcpur_size); + + memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); + } + + /* allocate address and map */ + vm.flags = VM_ALLOC; + vm.size = num_possible_cpus() * PMD_SIZE; + vm_area_register_early(&vm, PMD_SIZE); + + for_each_possible_cpu(cpu) { + pmd_t *pmd; + + pmd = populate_extra_pmd((unsigned long)vm.addr + + cpu * PMD_SIZE); + set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), + PAGE_KERNEL_LARGE)); + } + + /* we're ready, commit */ + pr_info("PERCPU: Remapped at %p with large pages, static data " + "%zu bytes\n", vm.addr, static_size); + + ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE, + pcpur_size - static_size, vm.addr, NULL); + goto out_free_ar; + +enomem: + for_each_possible_cpu(cpu) + if (pcpur_ptrs[cpu]) + free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); + ret = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pcpur_ptrs), ptrs_size); + return ret; +} +#else +static ssize_t __init setup_pcpu_remap(size_t static_size) +{ + return -EINVAL; +} +#endif + +/* + * Embedding allocator + * + * The first chunk is sized to just contain the static area plus + * PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using + * bootmem allocator and used as-is without being mapped into vmalloc + * area. This enables the first chunk to piggy back on the linear + * physical PMD mapping and doesn't add any additional pressure to + * TLB. + */ +static void *pcpue_ptr __initdata; +static size_t pcpue_unit_size __initdata; + +static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) +{ + return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + + ((size_t)pageno << PAGE_SHIFT)); +} + +static ssize_t __init setup_pcpu_embed(size_t static_size) +{ + unsigned int cpu; + + /* + * If large page isn't supported, there's no benefit in doing + * this. Also, embedding allocation doesn't play well with + * NUMA. + */ + if (!cpu_has_pse || pcpu_need_numa()) + return -EINVAL; + + /* allocate and copy */ + pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); + pcpue_unit_size = max_t(size_t, pcpue_unit_size, PCPU_MIN_UNIT_SIZE); + pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, + PAGE_SIZE); + if (!pcpue_ptr) + return -ENOMEM; + + for_each_possible_cpu(cpu) + memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load, + static_size); + + /* we're ready, commit */ + pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", + pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size); + + return pcpu_setup_first_chunk(pcpue_get_page, static_size, + pcpue_unit_size, + pcpue_unit_size - static_size, pcpue_ptr, + NULL); +} + +/* + * 4k page allocator + * + * This is the basic allocator. Static percpu area is allocated + * page-by-page and most of initialization is done by the generic + * setup function. + */ +static struct page **pcpu4k_pages __initdata; +static int pcpu4k_nr_static_pages __initdata; + +static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) +{ + if (pageno < pcpu4k_nr_static_pages) + return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; + return NULL; +} + +static void __init pcpu4k_populate_pte(unsigned long addr) +{ + populate_extra_pte(addr); +} + +static ssize_t __init setup_pcpu_4k(size_t static_size) +{ + size_t pages_size; + unsigned int cpu; + int i, j; + ssize_t ret; + + pcpu4k_nr_static_pages = PFN_UP(static_size); + + /* unaligned allocations can't be freed, round up to page size */ + pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() + * sizeof(pcpu4k_pages[0])); + pcpu4k_pages = alloc_bootmem(pages_size); + + /* allocate and copy */ + j = 0; + for_each_possible_cpu(cpu) + for (i = 0; i < pcpu4k_nr_static_pages; i++) { + void *ptr; + + ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); + if (!ptr) + goto enomem; + + memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); + pcpu4k_pages[j++] = virt_to_page(ptr); + } + + /* we're ready, commit */ + pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", + pcpu4k_nr_static_pages, static_size); + + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL, + pcpu4k_populate_pte); + goto out_free_ar; + +enomem: + while (--j >= 0) + free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); + ret = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pcpu4k_pages), pages_size); + return ret; +} + static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -61,38 +377,35 @@ static inline void setup_percpu_segment(int cpu) */ void __init setup_per_cpu_areas(void) { - ssize_t size; - char *ptr; - int cpu; - - /* Copy section for each CPU (we discard the original) */ - size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE); + size_t static_size = __per_cpu_end - __per_cpu_start; + unsigned int cpu; + unsigned long delta; + size_t pcpu_unit_size; + ssize_t ret; pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); - pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size); + /* + * Allocate percpu area. If PSE is supported, try to make use + * of large page mappings. Please read comments on top of + * each allocator for details. + */ + ret = setup_pcpu_remap(static_size); + if (ret < 0) + ret = setup_pcpu_embed(static_size); + if (ret < 0) + ret = setup_pcpu_4k(static_size); + if (ret < 0) + panic("cannot allocate static percpu area (%zu bytes, err=%zd)", + static_size, ret); - for_each_possible_cpu(cpu) { -#ifndef CONFIG_NEED_MULTIPLE_NODES - ptr = alloc_bootmem_pages(size); -#else - int node = early_cpu_to_node(cpu); - if (!node_online(node) || !NODE_DATA(node)) { - ptr = alloc_bootmem_pages(size); - pr_info("cpu %d has no node %d or node-local memory\n", - cpu, node); - pr_debug("per cpu data for cpu%d at %016lx\n", - cpu, __pa(ptr)); - } else { - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); - pr_debug("per cpu data for cpu%d on node%d at %016lx\n", - cpu, node, __pa(ptr)); - } -#endif + pcpu_unit_size = ret; - memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); - per_cpu_offset(cpu) = ptr - __per_cpu_start; + /* alrighty, percpu areas up and running */ + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) { + per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); @@ -125,8 +438,6 @@ void __init setup_per_cpu_areas(void) */ if (cpu == boot_cpu_id) switch_to_new_gdt(cpu); - - DBG("PERCPU: cpu %4d %p\n", cpu, ptr); } /* indicate the early static arrays will soon be gone */ diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c index 491e737ce54..aa098708877 100644 --- a/arch/x86/math-emu/fpu_aux.c +++ b/arch/x86/math-emu/fpu_aux.c @@ -30,20 +30,29 @@ static void fclex(void) } /* Needs to be externally visible */ -void finit(void) +void finit_task(struct task_struct *tsk) { - control_word = 0x037f; - partial_status = 0; - top = 0; /* We don't keep top in the status word internally. */ - fpu_tag_word = 0xffff; + struct i387_soft_struct *soft = &tsk->thread.xstate->soft; + struct address *oaddr, *iaddr; + soft->cwd = 0x037f; + soft->swd = 0; + soft->ftop = 0; /* We don't keep top in the status word internally. */ + soft->twd = 0xffff; /* The behaviour is different from that detailed in Section 15.1.6 of the Intel manual */ - operand_address.offset = 0; - operand_address.selector = 0; - instruction_address.offset = 0; - instruction_address.selector = 0; - instruction_address.opcode = 0; - no_ip_update = 1; + oaddr = (struct address *)&soft->foo; + oaddr->offset = 0; + oaddr->selector = 0; + iaddr = (struct address *)&soft->fip; + iaddr->offset = 0; + iaddr->selector = 0; + iaddr->opcode = 0; + soft->no_update = 1; +} + +void finit(void) +{ + finit_task(current); } /* diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index d57dfffb021..2966c6b8d30 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -131,6 +131,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) return pte_offset_kernel(pmd, 0); } +pmd_t * __init populate_extra_pmd(unsigned long vaddr) +{ + int pgd_idx = pgd_index(vaddr); + int pmd_idx = pmd_index(vaddr); + + return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx; +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + int pte_idx = pte_index(vaddr); + pmd_t *pmd; + + pmd = populate_extra_pmd(vaddr); + return one_page_table_init(pmd) + pte_idx; +} + static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, unsigned long vaddr, pte_t *lastpte) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7dd7ce49d69..8a853bc3b28 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -161,34 +161,51 @@ static __ref void *spp_getpage(void) return ptr; } -void -set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + if (pgd_none(*pgd)) { + pud_t *pud = (pud_t *)spp_getpage(); + pgd_populate(&init_mm, pgd, pud); + if (pud != pud_offset(pgd, 0)) + printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", + pud, pud_offset(pgd, 0)); + } + return pud_offset(pgd, vaddr); +} - pud = pud_page + pud_index(vaddr); +static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) +{ if (pud_none(*pud)) { - pmd = (pmd_t *) spp_getpage(); + pmd_t *pmd = (pmd_t *) spp_getpage(); pud_populate(&init_mm, pud, pmd); - if (pmd != pmd_offset(pud, 0)) { + if (pmd != pmd_offset(pud, 0)) printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", - pmd, pmd_offset(pud, 0)); - return; - } + pmd, pmd_offset(pud, 0)); } - pmd = pmd_offset(pud, vaddr); + return pmd_offset(pud, vaddr); +} + +static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) +{ if (pmd_none(*pmd)) { - pte = (pte_t *) spp_getpage(); + pte_t *pte = (pte_t *) spp_getpage(); pmd_populate_kernel(&init_mm, pmd, pte); - if (pte != pte_offset_kernel(pmd, 0)) { + if (pte != pte_offset_kernel(pmd, 0)) printk(KERN_ERR "PAGETABLE BUG #02!\n"); - return; - } } + return pte_offset_kernel(pmd, vaddr); +} + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pud = pud_page + pud_index(vaddr); + pmd = fill_pmd(pud, vaddr); + pte = fill_pte(pmd, vaddr); - pte = pte_offset_kernel(pmd, vaddr); set_pte(pte, new_pte); /* @@ -198,8 +215,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) __flush_tlb_one(vaddr); } -void -set_pte_vaddr(unsigned long vaddr, pte_t pteval) +void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; pud_t *pud_page; @@ -216,6 +232,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval) set_pte_vaddr_pud(pud_page, vaddr, pteval); } +pmd_t * __init populate_extra_pmd(unsigned long vaddr) +{ + pgd_t *pgd; + pud_t *pud; + + pgd = pgd_offset_k(vaddr); + pud = fill_pud(pgd, vaddr); + return fill_pmd(pud, vaddr); +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + pmd_t *pmd; + + pmd = populate_extra_pmd(vaddr); + return fill_pte(pmd, vaddr); +} + /* * Create large page table mappings for a range of physical addresses. */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c52f4034c7f..82cd39a6cbd 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -103,7 +103,7 @@ static void xen_vcpu_setup(int cpu) vcpup = &per_cpu(xen_vcpu_info, cpu); - info.mfn = virt_to_mfn(vcpup); + info.mfn = arbitrary_virt_to_mfn(vcpup); info.offset = offset_in_page(vcpup); printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", @@ -301,8 +301,10 @@ static void xen_load_gdt(const struct desc_ptr *dtr) frames = mcs.args; for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { - frames[f] = virt_to_mfn(va); + frames[f] = arbitrary_virt_to_mfn((void *)va); + make_lowmem_page_readonly((void *)va); + make_lowmem_page_readonly(mfn_to_virt(frames[f])); } MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); @@ -314,7 +316,7 @@ static void load_TLS_descriptor(struct thread_struct *t, unsigned int cpu, unsigned int i) { struct desc_struct *gdt = get_cpu_gdt_table(cpu); - xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); + xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); struct multicall_space mc = __xen_mc_entry(0); MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); @@ -488,7 +490,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, break; default: { - xmaddr_t maddr = virt_to_machine(&dt[entry]); + xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]); xen_mc_flush(); if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 319bd40a57c..cb6afa4ec95 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -276,6 +276,13 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) p2m_top[topidx][idx] = mfn; } +unsigned long arbitrary_virt_to_mfn(void *vaddr) +{ + xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); + + return PFN_DOWN(maddr.maddr); +} + xmaddr_t arbitrary_virt_to_machine(void *vaddr) { unsigned long address = (unsigned long)vaddr; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 035582ae815..8d470562ffc 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -219,6 +219,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) { struct vcpu_guest_context *ctxt; struct desc_struct *gdt; + unsigned long gdt_mfn; if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) return 0; @@ -248,9 +249,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->ldt_ents = 0; BUG_ON((unsigned long)gdt & ~PAGE_MASK); + + gdt_mfn = arbitrary_virt_to_mfn(gdt); make_lowmem_page_readonly(gdt); + make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); - ctxt->gdt_frames[0] = virt_to_mfn(gdt); + ctxt->gdt_frames[0] = gdt_mfn; ctxt->gdt_ents = GDT_ENTRIES; ctxt->user_regs.cs = __KERNEL_CS; |