From 874c4fe389d1358f82c96dc9b5092fc5c7690604 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Tue, 26 Sep 2006 10:52:26 +0200
Subject: [PATCH] i386: Allow use of GENERICARCH for UP kernels

There are some machines around (large xSeries or Unisys ES7000) that need
physical IO-APIC destination mode to access all of their IO devices. This
currently doesn't work in UP kernels as used in distribution installers.

This patch allows even UP kernels to be compiled as GENERICARCH, which
makes it possible to use physical or clustered APIC mode.

Signed-off-by: Andi Kleen
---
 arch/i386/kernel/io_apic.c | 1 +
 arch/i386/kernel/mpparse.c | 1 +
 2 files changed, 2 insertions(+)
(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 4fb32c551fe..4eacd52eeb7 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -40,6 +40,7 @@
 #include
 #include
+#include
 #include "io_ports.h"

diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index a70b5fa0ef0..82756968856 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -30,6 +30,7 @@
 #include
 #include
+#include
 #include
 #include
--
cgit v1.2.3


From b07f8915cda3fcd73b8b68075ba1e6cd0673365d Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Tue, 26 Sep 2006 10:52:26 +0200
Subject: [PATCH] x86: Temporarily revert parts of the Core 2 nmi watchdog
 support

This makes merging easier. They are re-added a few patches later.

Signed-off-by: Andi Kleen
---
 arch/i386/kernel/nmi.c | 65 +-------------------------------------------------
 1 file changed, 1 insertion(+), 64 deletions(-)
(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index acb351478e4..1282d70ff97 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -24,7 +24,6 @@
 #include
 #include
-#include
 #include "mach_traps.h"
@@ -96,9 +95,6 @@ int nmi_active;
 	(P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
 	 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)

-#define ARCH_PERFMON_NMI_EVENT_SEL	ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
-#define ARCH_PERFMON_NMI_EVENT_UMASK	ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
-
 #ifdef CONFIG_SMP
 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
  * the CPU is idle. To make sure the NMI watchdog really ticks on all
@@ -211,8 +207,6 @@ static int __init setup_nmi_watchdog(char *str)

 __setup("nmi_watchdog=", setup_nmi_watchdog);

-static void disable_intel_arch_watchdog(void);
-
 static void disable_lapic_nmi_watchdog(void)
 {
 	if (nmi_active <= 0)
@@ -222,10 +216,6 @@ static void disable_lapic_nmi_watchdog(void)
 		wrmsr(MSR_K7_EVNTSEL0, 0, 0);
 		break;
 	case X86_VENDOR_INTEL:
-		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
-			disable_intel_arch_watchdog();
-			break;
-		}
 		switch (boot_cpu_data.x86) {
 		case 6:
 			if (boot_cpu_data.x86_model > 0xd)
				break;
@@ -454,53 +444,6 @@ static int setup_p4_watchdog(void)
 	return 1;
 }

-static void disable_intel_arch_watchdog(void)
-{
-	unsigned ebx;
-
-	/*
-	 * Check whether the Architectural PerfMon supports
-	 * Unhalted Core Cycles Event or not.
-	 * NOTE: Corresponding bit = 0 in ebp indicates event present.
-	 */
-	ebx = cpuid_ebx(10);
-	if (!(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
-		wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, 0, 0);
-}
-
-static int setup_intel_arch_watchdog(void)
-{
-	unsigned int evntsel;
-	unsigned ebx;
-
-	/*
-	 * Check whether the Architectural PerfMon supports
-	 * Unhalted Core Cycles Event or not.
-	 * NOTE: Corresponding bit = 0 in ebp indicates event present.
-	 */
-	ebx = cpuid_ebx(10);
-	if ((ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
-		return 0;
-
-	nmi_perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
-
-	clear_msr_range(MSR_ARCH_PERFMON_EVENTSEL0, 2);
-	clear_msr_range(MSR_ARCH_PERFMON_PERFCTR0, 2);
-
-	evntsel = ARCH_PERFMON_EVENTSEL_INT
-		| ARCH_PERFMON_EVENTSEL_OS
-		| ARCH_PERFMON_EVENTSEL_USR
-		| ARCH_PERFMON_NMI_EVENT_SEL
-		| ARCH_PERFMON_NMI_EVENT_UMASK;
-
-	wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0);
-	write_watchdog_counter("INTEL_ARCH_PERFCTR0");
-	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-	wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0);
-	return 1;
-}
-
 void setup_apic_nmi_watchdog (void)
 {
 	switch (boot_cpu_data.x86_vendor) {
@@ -510,11 +453,6 @@ void setup_apic_nmi_watchdog (void)
 		setup_k7_watchdog();
 		break;
 	case X86_VENDOR_INTEL:
-		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
-			if (!setup_intel_arch_watchdog())
-				return;
-			break;
-		}
 		switch (boot_cpu_data.x86) {
 		case 6:
 			if (boot_cpu_data.x86_model > 0xd)
@@ -619,8 +557,7 @@ void nmi_watchdog_tick (struct pt_regs * regs)
 			wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
 			apic_write(APIC_LVTPC, APIC_DM_NMI);
 		}
-		else if (nmi_perfctr_msr == MSR_P6_PERFCTR0 ||
-			 nmi_perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
+		else if (nmi_perfctr_msr == MSR_P6_PERFCTR0) {
 			/* Only P6 based Pentium M need to re-unmask
 			 * the apic vector but it doesn't hurt
 			 * other P6 variant */
--
cgit v1.2.3


From 828f0afda123a96ff4e8078f057a302f4b4232ae Mon Sep 17 00:00:00 2001
From: Don Zickus
Date: Tue, 26 Sep 2006 10:52:26 +0200
Subject: [PATCH] x86: Add performance counter reservation framework for UP
 kernels

Adds basic infrastructure to allow subsystems to reserve performance
counters on the x86 chips. Only UP kernels are supported in this patch
to make reviewing easier. The SMP portion makes a lot more changes.

Think of this as a locking mechanism where each bit represents a
different counter. In addition, each subsystem should also reserve an
appropriate event selection register that will correspond to the
performance counter it will be using (this is mainly necessary for the
Pentium 4 chips as they break the 1:1 relationship to performance
counters). This will help prevent subsystems like oprofile from
interfering with the nmi watchdog.

Signed-off-by: Don Zickus
Signed-off-by: Andi Kleen
---
 arch/i386/kernel/nmi.c | 188 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 156 insertions(+), 32 deletions(-)
(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 1282d70ff97..5d58dfeacd5 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -34,6 +34,20 @@ static unsigned int nmi_perfctr_msr;	/* the MSR to reset in NMI handler */
 static unsigned int nmi_p4_cccr_val;
 extern void show_registers(struct pt_regs *regs);

+/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
+ * evtsel_nmi_owner tracks the ownership of the event selection
+ * - different performance counters/ event selection may be reserved for
+ *   different subsystems this reservation system just tries to coordinate
+ *   things a little
+ */
+static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
+static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);
+
+/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
+ * offset from MSR_P4_BSU_ESCR0.
+ * It will be the max for all platforms (for now)
+ */
+#define NMI_MAX_COUNTER_BITS 66
+
 /*
  * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
  * - it may be reserved by some other driver, or not
@@ -95,6 +109,105 @@ int nmi_active;
 	(P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
 	 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)

+/* converts an msr to an appropriate reservation bit */
+static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
+{
+	/* returns the bit offset of the performance counter register */
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+		return (msr - MSR_K7_PERFCTR0);
+	case X86_VENDOR_INTEL:
+		switch (boot_cpu_data.x86) {
+		case 6:
+			return (msr - MSR_P6_PERFCTR0);
+		case 15:
+			return (msr - MSR_P4_BPU_PERFCTR0);
+		}
+	}
+	return 0;
+}
+
+/* converts an msr to an appropriate reservation bit */
+static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
+{
+	/* returns the bit offset of the event selection register */
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+		return (msr - MSR_K7_EVNTSEL0);
+	case X86_VENDOR_INTEL:
+		switch (boot_cpu_data.x86) {
+		case 6:
+			return (msr - MSR_P6_EVNTSEL0);
+		case 15:
+			return (msr - MSR_P4_BSU_ESCR0);
+		}
+	}
+	return 0;
+}
+
+/* checks for a bit availability (hack for oprofile) */
+int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
+{
+	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+	return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
+}
+
+/* checks an msr for availability */
+int avail_to_resrv_perfctr_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_perfctr_msr_to_bit(msr);
+	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+	return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
+}
+
+int reserve_perfctr_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_perfctr_msr_to_bit(msr);
+	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+	if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner)))
+		return 1;
+	return 0;
+}
+
+void release_perfctr_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_perfctr_msr_to_bit(msr);
+	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+	clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner));
+}
+
+int reserve_evntsel_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_evntsel_msr_to_bit(msr);
+	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+	if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]))
+		return 1;
+	return 0;
+}
+
+void release_evntsel_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_evntsel_msr_to_bit(msr);
+	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+	clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]);
+}
+
 #ifdef CONFIG_SMP
 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
  * the CPU is idle. To make sure the NMI watchdog really ticks on all
@@ -344,14 +457,6 @@ late_initcall(init_lapic_nmi_sysfs);
  * Original code written by Keith Owens.
  */
-static void clear_msr_range(unsigned int base, unsigned int n)
-{
-	unsigned int i;
-
-	for(i = 0; i < n; ++i)
-		wrmsr(base+i, 0, 0);
-}
-
 static void write_watchdog_counter(const char *descr)
 {
 	u64 count = (u64)cpu_khz * 1000;
@@ -362,14 +467,19 @@ static void write_watchdog_counter(const char *descr)
 	wrmsrl(nmi_perfctr_msr, 0 - count);
 }

-static void setup_k7_watchdog(void)
+static int setup_k7_watchdog(void)
 {
 	unsigned int evntsel;

 	nmi_perfctr_msr = MSR_K7_PERFCTR0;

-	clear_msr_range(MSR_K7_EVNTSEL0, 4);
-	clear_msr_range(MSR_K7_PERFCTR0, 4);
+	if (!reserve_perfctr_nmi(nmi_perfctr_msr))
+		goto fail;
+
+	if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0))
+		goto fail1;
+
+	wrmsrl(MSR_K7_PERFCTR0, 0UL);

 	evntsel = K7_EVNTSEL_INT
 		| K7_EVNTSEL_OS
@@ -381,16 +491,24 @@ static void setup_k7_watchdog(void)
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
 	evntsel |= K7_EVNTSEL_ENABLE;
 	wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
+	return 1;
+fail1:
+	release_perfctr_nmi(nmi_perfctr_msr);
+fail:
+	return 0;
 }

-static void setup_p6_watchdog(void)
+static int setup_p6_watchdog(void)
 {
 	unsigned int evntsel;

 	nmi_perfctr_msr = MSR_P6_PERFCTR0;

-	clear_msr_range(MSR_P6_EVNTSEL0, 2);
-	clear_msr_range(MSR_P6_PERFCTR0, 2);
+	if (!reserve_perfctr_nmi(nmi_perfctr_msr))
+		goto fail;
+
+	if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0))
+		goto fail1;

 	evntsel = P6_EVNTSEL_INT
 		| P6_EVNTSEL_OS
@@ -402,6 +520,11 @@ static void setup_p6_watchdog(void)
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
 	evntsel |= P6_EVNTSEL0_ENABLE;
 	wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
+	return 1;
+fail1:
+	release_perfctr_nmi(nmi_perfctr_msr);
+fail:
+	return 0;
 }

 static int setup_p4_watchdog(void)
@@ -419,22 +542,11 @@ static int setup_p4_watchdog(void)
 		nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;
 #endif

-	if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
-		clear_msr_range(0x3F1, 2);
-	/* MSR 0x3F0 seems to have a default value of 0xFC00, but current
-	   docs doesn't fully define it, so leave it alone for now.
-	 */
-	if (boot_cpu_data.x86_model >= 0x3) {
-		/* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
-		clear_msr_range(0x3A0, 26);
-		clear_msr_range(0x3BC, 3);
-	} else {
-		clear_msr_range(0x3A0, 31);
-	}
-	clear_msr_range(0x3C0, 6);
-	clear_msr_range(0x3C8, 6);
-	clear_msr_range(0x3E0, 2);
-	clear_msr_range(MSR_P4_CCCR0, 18);
-	clear_msr_range(MSR_P4_PERFCTR0, 18);
+	if (!reserve_perfctr_nmi(nmi_perfctr_msr))
+		goto fail;
+
+	if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
+		goto fail1;

 	wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
 	wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
@@ -442,6 +554,10 @@ static int setup_p4_watchdog(void)
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
 	wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
 	return 1;
+fail1:
+	release_perfctr_nmi(nmi_perfctr_msr);
+fail:
+	return 0;
 }

 void setup_apic_nmi_watchdog (void)
@@ -450,7 +566,8 @@ void setup_apic_nmi_watchdog (void)
 	case X86_VENDOR_AMD:
 		if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
 			return;
-		setup_k7_watchdog();
+		if (!setup_k7_watchdog())
+			return;
 		break;
 	case X86_VENDOR_INTEL:
 		switch (boot_cpu_data.x86) {
@@ -458,7 +575,8 @@ void setup_apic_nmi_watchdog (void)
 		case 6:
 			if (boot_cpu_data.x86_model > 0xd)
 				return;

-			setup_p6_watchdog();
+			if(!setup_p6_watchdog())
+				return;
 			break;
 		case 15:
 			if (boot_cpu_data.x86_model > 0x4)
				return;
@@ -612,6 +730,12 @@ int proc_unknown_nmi_panic(ctl_table *table, int write, struct file *file,

 EXPORT_SYMBOL(nmi_active);
 EXPORT_SYMBOL(nmi_watchdog);
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
+EXPORT_SYMBOL(reserve_perfctr_nmi);
+EXPORT_SYMBOL(release_perfctr_nmi);
+EXPORT_SYMBOL(reserve_evntsel_nmi);
+EXPORT_SYMBOL(release_evntsel_nmi);
 EXPORT_SYMBOL(reserve_lapic_nmi);
 EXPORT_SYMBOL(release_lapic_nmi);
 EXPORT_SYMBOL(disable_timer_nmi_watchdog);
--
cgit v1.2.3


From b7471c6da94d30d3deadc55986cc38d1ff57f9ca Mon Sep 17 00:00:00 2001
From: Don Zickus
Date: Tue, 26 Sep 2006 10:52:26 +0200
Subject: [PATCH] i386: Add SMP support on i386 to reservation framework

This patch includes the changes to make the nmi watchdog on i386 SMP
aware. A bunch of code was moved around to make it simpler to read. In
addition, it is now possible to determine if a particular NMI was the
result of the watchdog or not. This feature allows the kernel to filter
out unknown NMIs more easily; the reservation idea itself is sketched
below.
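[Editor's note: a minimal, self-contained sketch of the reservation scheme
this series builds on — one bit per counter, with test_and_set_bit() acting
as a try-lock. The names reserve_counter/release_counter and the single
(non-per-CPU) bitmask are illustrative simplifications, not the exact
kernel interfaces added by these patches:]

	#include <linux/bitops.h>

	/* one bit per perfctr MSR; see NMI_MAX_COUNTER_BITS above */
	static unsigned long counter_owner[BITS_TO_LONGS(NMI_MAX_COUNTER_BITS)];

	/* atomically claim a counter; returns 1 on success,
	 * 0 if another subsystem already owns it */
	static int reserve_counter(unsigned int bit)
	{
		return !test_and_set_bit(bit, counter_owner);
	}

	/* give the counter back so e.g. oprofile can use it */
	static void release_counter(unsigned int bit)
	{
		clear_bit(bit, counter_owner);
	}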
Signed-off-by: Don Zickus
Signed-off-by: Andi Kleen
---
 arch/i386/kernel/apic.c  |   3 +-
 arch/i386/kernel/nmi.c   | 537 +++++++++++++++++++++++++++++++----------------
 arch/i386/kernel/traps.c |   2 +-
 3 files changed, 356 insertions(+), 186 deletions(-)
(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 8c844d07862..1a34fc57800 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -586,8 +586,7 @@ void __devinit setup_local_APIC(void)
 			printk("No ESR for 82489DX.\n");
 	}

-	if (nmi_watchdog == NMI_LOCAL_APIC)
-		setup_apic_nmi_watchdog();
+	setup_apic_nmi_watchdog(NULL);
 	apic_pm_activate();
 }

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 5d58dfeacd5..d8800434303 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -24,16 +24,10 @@
 #include
 #include
+#include
 #include "mach_traps.h"

-unsigned int nmi_watchdog = NMI_NONE;
-extern int unknown_nmi_panic;
-static unsigned int nmi_hz = HZ;
-static unsigned int nmi_perfctr_msr;	/* the MSR to reset in NMI handler */
-static unsigned int nmi_p4_cccr_val;
-extern void show_registers(struct pt_regs *regs);
-
 /* perfctr_nmi_owner tracks the ownership of the perfctr registers:
  * evtsel_nmi_owner tracks the ownership of the event selection
  * - different performance counters/ event selection may be reserved for
@@ -63,51 +57,31 @@ static unsigned int lapic_nmi_owner;
 #define LAPIC_NMI_RESERVED	(1<<1)

 /* nmi_active:
- * +1: the lapic NMI watchdog is active, but can be disabled
- * 0: the lapic NMI watchdog has not been set up, and cannot
+ * >0: the lapic NMI watchdog is active, but can be disabled
+ * <0: the lapic NMI watchdog has not been set up, and cannot
  *     be enabled
- * -1: the lapic NMI watchdog is disabled, but can be enabled
+ *  0: the lapic NMI watchdog is disabled, but can be enabled
  */
-int nmi_active;
+atomic_t nmi_active = ATOMIC_INIT(0);	/* oprofile uses this */

-#define K7_EVNTSEL_ENABLE	(1 << 22)
-#define K7_EVNTSEL_INT		(1 << 20)
-#define K7_EVNTSEL_OS		(1 << 17)
-#define K7_EVNTSEL_USR		(1 << 16)
-#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING	0x76
-#define K7_NMI_EVENT		K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
+unsigned int nmi_watchdog = NMI_DEFAULT;
+static unsigned int nmi_hz = HZ;

-#define P6_EVNTSEL0_ENABLE	(1 << 22)
-#define P6_EVNTSEL_INT		(1 << 20)
-#define P6_EVNTSEL_OS		(1 << 17)
-#define P6_EVNTSEL_USR		(1 << 16)
-#define P6_EVENT_CPU_CLOCKS_NOT_HALTED	0x79
-#define P6_NMI_EVENT		P6_EVENT_CPU_CLOCKS_NOT_HALTED
+struct nmi_watchdog_ctlblk {
+	int enabled;
+	u64 check_bit;
+	unsigned int cccr_msr;
+	unsigned int perfctr_msr;  /* the MSR to reset in NMI handler */
+	unsigned int evntsel_msr;  /* the MSR to select the events to handle */
+};
+static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);

-#define MSR_P4_MISC_ENABLE	0x1A0
-#define MSR_P4_MISC_ENABLE_PERF_AVAIL	(1<<7)
-#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL	(1<<12)
-#define MSR_P4_PERFCTR0		0x300
-#define MSR_P4_CCCR0		0x360
-#define P4_ESCR_EVENT_SELECT(N)	((N)<<25)
-#define P4_ESCR_OS		(1<<3)
-#define P4_ESCR_USR		(1<<2)
-#define P4_CCCR_OVF_PMI0	(1<<26)
-#define P4_CCCR_OVF_PMI1	(1<<27)
-#define P4_CCCR_THRESHOLD(N)	((N)<<20)
-#define P4_CCCR_COMPLEMENT	(1<<19)
-#define P4_CCCR_COMPARE		(1<<18)
-#define P4_CCCR_REQUIRED	(3<<16)
-#define P4_CCCR_ESCR_SELECT(N)	((N)<<13)
-#define P4_CCCR_ENABLE		(1<<12)
-/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
-   CRU_ESCR0 (with any non-null event selector) through a complemented
-   max threshold.
-   [IA32-Vol3, Section 14.9.9] */
-#define MSR_P4_IQ_COUNTER0	0x30C
-#define P4_NMI_CRU_ESCR0	(P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR)
-#define P4_NMI_IQ_CCCR0	\
-	(P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
-	 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
+/* local prototypes */
+static void stop_apic_nmi_watchdog(void *unused);
+static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
+
+extern void show_registers(struct pt_regs *regs);
+extern int unknown_nmi_panic;

 /* converts an msr to an appropriate reservation bit */
 static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
@@ -208,6 +182,17 @@ void release_evntsel_nmi(unsigned int msr)
 	clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]);
 }

+static __cpuinit inline int nmi_known_cpu(void)
+{
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+		return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
+	case X86_VENDOR_INTEL:
+		return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
+	}
+	return 0;
+}
+
 #ifdef CONFIG_SMP
 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
  * the CPU is idle. To make sure the NMI watchdog really ticks on all
@@ -234,7 +219,10 @@ static int __init check_nmi_watchdog(void)
 	unsigned int *prev_nmi_count;
 	int cpu;

-	if (nmi_watchdog == NMI_NONE)
+	if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT))
+		return 0;
+
+	if (!atomic_read(&nmi_active))
 		return 0;

 	prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
@@ -258,18 +246,22 @@ static int __init check_nmi_watchdog(void)
 		if (!cpu_isset(cpu, cpu_callin_map))
 			continue;
 #endif
+		if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
+			continue;
 		if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
-			endflag = 1;
 			printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
 				cpu,
 				prev_nmi_count[cpu],
 				nmi_count(cpu));
-			nmi_active = 0;
-			lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
-			kfree(prev_nmi_count);
-			return -1;
+			per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
+			atomic_dec(&nmi_active);
 		}
 	}
+	if (!atomic_read(&nmi_active)) {
+		kfree(prev_nmi_count);
+		atomic_set(&nmi_active, -1);
+		return -1;
+	}
 	endflag = 1;
 	printk("OK.\n");

@@ -290,31 +282,16 @@ static int __init setup_nmi_watchdog(char *str)

 	get_option(&str, &nmi);

-	if (nmi >= NMI_INVALID)
+	if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
 		return 0;
-	if (nmi == NMI_NONE)
-		nmi_watchdog = nmi;
 	/*
 	 * If any other x86 CPU has a local APIC, then
 	 * please test the NMI stuff there and send me the
 	 * missing bits. Right now Intel P6/P4 and AMD K7 only.
 	 */
-	if ((nmi == NMI_LOCAL_APIC) &&
-			(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
-			(boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15))
-		nmi_watchdog = nmi;
-	if ((nmi == NMI_LOCAL_APIC) &&
-			(boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
-			(boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15))
-		nmi_watchdog = nmi;
-	/*
-	 * We can enable the IO-APIC watchdog
-	 * unconditionally.
-	 */
-	if (nmi == NMI_IO_APIC) {
-		nmi_active = 1;
-		nmi_watchdog = nmi;
-	}
+	if ((nmi == NMI_LOCAL_APIC) && (nmi_known_cpu() == 0))
+		return 0;  /* no lapic support */
+	nmi_watchdog = nmi;
 	return 1;
 }

@@ -322,41 +299,30 @@ __setup("nmi_watchdog=", setup_nmi_watchdog);

 static void disable_lapic_nmi_watchdog(void)
 {
-	if (nmi_active <= 0)
+	BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
+
+	if (atomic_read(&nmi_active) <= 0)
 		return;
-	switch (boot_cpu_data.x86_vendor) {
-	case X86_VENDOR_AMD:
-		wrmsr(MSR_K7_EVNTSEL0, 0, 0);
-		break;
-	case X86_VENDOR_INTEL:
-		switch (boot_cpu_data.x86) {
-		case 6:
-			if (boot_cpu_data.x86_model > 0xd)
-				break;
-			wrmsr(MSR_P6_EVNTSEL0, 0, 0);
-			break;
-		case 15:
-			if (boot_cpu_data.x86_model > 0x4)
-				break;
+	on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);

-			wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
-			wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
-			break;
-		}
-		break;
-	}
-	nmi_active = -1;
-	/* tell do_nmi() and others that we're not active any more */
-	nmi_watchdog = 0;
+	BUG_ON(atomic_read(&nmi_active) != 0);
 }

 static void enable_lapic_nmi_watchdog(void)
 {
-	if (nmi_active < 0) {
-		nmi_watchdog = NMI_LOCAL_APIC;
-		setup_apic_nmi_watchdog();
-	}
+	BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
+
+	/* are we already enabled */
+	if (atomic_read(&nmi_active) != 0)
+		return;
+
+	/* are we lapic aware */
+	if (nmi_known_cpu() <= 0)
+		return;
+
+	on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
+	touch_nmi_watchdog();
 }

 int reserve_lapic_nmi(void)
@@ -388,20 +354,25 @@ void release_lapic_nmi(void)

 void disable_timer_nmi_watchdog(void)
 {
-	if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0))
+	BUG_ON(nmi_watchdog != NMI_IO_APIC);
+
+	if (atomic_read(&nmi_active) <= 0)
 		return;

-	unset_nmi_callback();
-	nmi_active = -1;
-	nmi_watchdog = NMI_NONE;
+	disable_irq(0);
+	on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
+
+	BUG_ON(atomic_read(&nmi_active) != 0);
 }

 void enable_timer_nmi_watchdog(void)
 {
-	if (nmi_active < 0) {
-		nmi_watchdog = NMI_IO_APIC;
+	BUG_ON(nmi_watchdog != NMI_IO_APIC);
+
+	if (atomic_read(&nmi_active) == 0) {
 		touch_nmi_watchdog();
-		nmi_active = 1;
+		on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
+		enable_irq(0);
 	}
 }

@@ -411,7 +382,7 @@ static int nmi_pm_active; /* nmi_active before suspend */

 static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
 {
-	nmi_pm_active = nmi_active;
+	nmi_pm_active = atomic_read(&nmi_active);
 	disable_lapic_nmi_watchdog();
 	return 0;
 }
@@ -439,7 +410,13 @@ static int __init init_lapic_nmi_sysfs(void)
 {
 	int error;

-	if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC)
+	/* should really be a BUG_ON but b/c this is an
+	 * init call, it just doesn't work.  -dcz
+	 */
+	if (nmi_watchdog != NMI_LOCAL_APIC)
+		return 0;
+
+	if ( atomic_read(&nmi_active) < 0 )
 		return 0;

 	error = sysdev_class_register(&nmi_sysclass);
@@ -457,143 +434,312 @@ late_initcall(init_lapic_nmi_sysfs);
 * Original code written by Keith Owens.
 */

-static void write_watchdog_counter(const char *descr)
+static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
 {
 	u64 count = (u64)cpu_khz * 1000;

 	do_div(count, nmi_hz);
 	if(descr)
 		Dprintk("setting %s to -0x%08Lx\n", descr, count);
-	wrmsrl(nmi_perfctr_msr, 0 - count);
+	wrmsrl(perfctr_msr, 0 - count);
 }

+/* Note that these events don't tick when the CPU idles. This means
+   the frequency varies with CPU load.
+ */
+
+#define K7_EVNTSEL_ENABLE	(1 << 22)
+#define K7_EVNTSEL_INT		(1 << 20)
+#define K7_EVNTSEL_OS		(1 << 17)
+#define K7_EVNTSEL_USR		(1 << 16)
+#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING	0x76
+#define K7_NMI_EVENT		K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
+
 static int setup_k7_watchdog(void)
 {
+	unsigned int perfctr_msr, evntsel_msr;
 	unsigned int evntsel;
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

-	nmi_perfctr_msr = MSR_K7_PERFCTR0;
-
-	if (!reserve_perfctr_nmi(nmi_perfctr_msr))
+	perfctr_msr = MSR_K7_PERFCTR0;
+	evntsel_msr = MSR_K7_EVNTSEL0;
+	if (!reserve_perfctr_nmi(perfctr_msr))
 		goto fail;

-	if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0))
+	if (!reserve_evntsel_nmi(evntsel_msr))
 		goto fail1;

 	wrmsrl(perfctr_msr, 0UL);

 	evntsel = K7_EVNTSEL_INT
 		| K7_EVNTSEL_OS
 		| K7_EVNTSEL_USR
 		| K7_NMI_EVENT;

-	wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
-	write_watchdog_counter("K7_PERFCTR0");
+	/* setup the timer */
+	wrmsr(evntsel_msr, evntsel, 0);
+	write_watchdog_counter(perfctr_msr, "K7_PERFCTR0");
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
 	evntsel |= K7_EVNTSEL_ENABLE;
-	wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
+	wrmsr(evntsel_msr, evntsel, 0);
+
+	wd->perfctr_msr = perfctr_msr;
+	wd->evntsel_msr = evntsel_msr;
+	wd->cccr_msr = 0;  //unused
+	wd->check_bit = 1ULL<<63;
 	return 1;
 fail1:
-	release_perfctr_nmi(nmi_perfctr_msr);
+	release_perfctr_nmi(perfctr_msr);
 fail:
 	return 0;
 }

+static void stop_k7_watchdog(void)
+{
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+	wrmsr(wd->evntsel_msr, 0, 0);
+
+	release_evntsel_nmi(wd->evntsel_msr);
+	release_perfctr_nmi(wd->perfctr_msr);
+}
+
+#define P6_EVNTSEL0_ENABLE	(1 << 22)
+#define P6_EVNTSEL_INT		(1 << 20)
+#define P6_EVNTSEL_OS		(1 << 17)
+#define P6_EVNTSEL_USR		(1 << 16)
+#define P6_EVENT_CPU_CLOCKS_NOT_HALTED	0x79
+#define P6_NMI_EVENT		P6_EVENT_CPU_CLOCKS_NOT_HALTED
+
 static int setup_p6_watchdog(void)
 {
+	unsigned int perfctr_msr, evntsel_msr;
 	unsigned int evntsel;
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

-	nmi_perfctr_msr = MSR_P6_PERFCTR0;
-
-	if (!reserve_perfctr_nmi(nmi_perfctr_msr))
+	perfctr_msr = MSR_P6_PERFCTR0;
+	evntsel_msr = MSR_P6_EVNTSEL0;
+	if (!reserve_perfctr_nmi(perfctr_msr))
 		goto fail;

-	if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0))
+	if (!reserve_evntsel_nmi(evntsel_msr))
 		goto fail1;

+	wrmsrl(perfctr_msr, 0UL);
+
 	evntsel = P6_EVNTSEL_INT
 		| P6_EVNTSEL_OS
 		| P6_EVNTSEL_USR
 		| P6_NMI_EVENT;

-	wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
-	write_watchdog_counter("P6_PERFCTR0");
+	/* setup the timer */
+	wrmsr(evntsel_msr, evntsel, 0);
+	write_watchdog_counter(perfctr_msr, "P6_PERFCTR0");
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
 	evntsel |= P6_EVNTSEL0_ENABLE;
-	wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
+	wrmsr(evntsel_msr, evntsel, 0);
+
+	wd->perfctr_msr = perfctr_msr;
+	wd->evntsel_msr = evntsel_msr;
+	wd->cccr_msr = 0;  //unused
+	wd->check_bit = 1ULL<<39;
 	return 1;
 fail1:
-	release_perfctr_nmi(nmi_perfctr_msr);
+	release_perfctr_nmi(perfctr_msr);
 fail:
 	return 0;
 }

+static void stop_p6_watchdog(void)
+{
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+	wrmsr(wd->evntsel_msr, 0, 0);
+
+	release_evntsel_nmi(wd->evntsel_msr);
+	release_perfctr_nmi(wd->perfctr_msr);
+}
+
+/* Note that these events don't tick when the CPU idles. This means
+   the frequency varies with CPU load.
+ */
+
+#define MSR_P4_MISC_ENABLE_PERF_AVAIL	(1<<7)
+#define P4_ESCR_EVENT_SELECT(N)	((N)<<25)
+#define P4_ESCR_OS		(1<<3)
+#define P4_ESCR_USR		(1<<2)
+#define P4_CCCR_OVF_PMI0	(1<<26)
+#define P4_CCCR_OVF_PMI1	(1<<27)
+#define P4_CCCR_THRESHOLD(N)	((N)<<20)
+#define P4_CCCR_COMPLEMENT	(1<<19)
+#define P4_CCCR_COMPARE		(1<<18)
+#define P4_CCCR_REQUIRED	(3<<16)
+#define P4_CCCR_ESCR_SELECT(N)	((N)<<13)
+#define P4_CCCR_ENABLE		(1<<12)
+#define P4_CCCR_OVF		(1<<31)
+/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
+   CRU_ESCR0 (with any non-null event selector) through a complemented
+   max threshold. [IA32-Vol3, Section 14.9.9] */
+
 static int setup_p4_watchdog(void)
 {
+	unsigned int perfctr_msr, evntsel_msr, cccr_msr;
+	unsigned int evntsel, cccr_val;
 	unsigned int misc_enable, dummy;
+	unsigned int ht_num;
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

-	rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy);
+	rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
 	if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
 		return 0;

-	nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
-	nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
 #ifdef CONFIG_SMP
-	if (smp_num_siblings == 2)
-		nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;
+	/* detect which hyperthread we are on */
+	if (smp_num_siblings == 2) {
+		unsigned int ebx, apicid;
+
+		ebx = cpuid_ebx(1);
+		apicid = (ebx >> 24) & 0xff;
+		ht_num = apicid & 1;
+	} else
 #endif
+		ht_num = 0;
+
+	/* performance counters are shared resources
+	 * assign each hyperthread its own set
+	 * (re-use the ESCR0 register, seems safe
+	 * and keeps the cccr_val the same)
+	 */
+	if (!ht_num) {
+		/* logical cpu 0 */
+		perfctr_msr = MSR_P4_IQ_PERFCTR0;
+		evntsel_msr = MSR_P4_CRU_ESCR0;
+		cccr_msr = MSR_P4_IQ_CCCR0;
+		cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
+	} else {
+		/* logical cpu 1 */
+		perfctr_msr = MSR_P4_IQ_PERFCTR1;
+		evntsel_msr = MSR_P4_CRU_ESCR0;
+		cccr_msr = MSR_P4_IQ_CCCR1;
+		cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
+	}

-	if (!reserve_perfctr_nmi(nmi_perfctr_msr))
+	if (!reserve_perfctr_nmi(perfctr_msr))
 		goto fail;

-	if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
+	if (!reserve_evntsel_nmi(evntsel_msr))
 		goto fail1;

-	wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
-	wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
-	write_watchdog_counter("P4_IQ_COUNTER0");
+	evntsel = P4_ESCR_EVENT_SELECT(0x3F)
+		| P4_ESCR_OS
+		| P4_ESCR_USR;
+
+	cccr_val |= P4_CCCR_THRESHOLD(15)
+		 | P4_CCCR_COMPLEMENT
+		 | P4_CCCR_COMPARE
+		 | P4_CCCR_REQUIRED;
+
+	wrmsr(evntsel_msr, evntsel, 0);
+	wrmsr(cccr_msr, cccr_val, 0);
+	write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0");
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
+	cccr_val |= P4_CCCR_ENABLE;
+	wrmsr(cccr_msr, cccr_val, 0);
+	wd->perfctr_msr = perfctr_msr;
+	wd->evntsel_msr = evntsel_msr;
+	wd->cccr_msr = cccr_msr;
+	wd->check_bit = 1ULL<<39;
 	return 1;
 fail1:
-	release_perfctr_nmi(nmi_perfctr_msr);
+	release_perfctr_nmi(perfctr_msr);
 fail:
 	return 0;
 }

-void setup_apic_nmi_watchdog (void)
+static void stop_p4_watchdog(void)
 {
-	switch (boot_cpu_data.x86_vendor) {
-	case X86_VENDOR_AMD:
-		if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
-			return;
-		if (!setup_k7_watchdog())
-			return;
-		break;
-	case X86_VENDOR_INTEL:
-		switch (boot_cpu_data.x86) {
-		case 6:
-			if (boot_cpu_data.x86_model > 0xd)
-				return;
-
-			if(!setup_p6_watchdog())
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+	wrmsr(wd->cccr_msr, 0, 0);
+	wrmsr(wd->evntsel_msr, 0, 0);
-
+	release_evntsel_nmi(wd->evntsel_msr);
+	release_perfctr_nmi(wd->perfctr_msr);
+}
+
+void setup_apic_nmi_watchdog (void *unused)
+{
+	/* only support LOCAL and IO APICs for now */
+	if ((nmi_watchdog != NMI_LOCAL_APIC) &&
+	    (nmi_watchdog != NMI_IO_APIC))
+		return;
+
+	if (nmi_watchdog == NMI_LOCAL_APIC) {
+		switch (boot_cpu_data.x86_vendor) {
+		case X86_VENDOR_AMD:
+			if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
 				return;
-			break;
-		case 15:
-			if (boot_cpu_data.x86_model > 0x4)
+			if (!setup_k7_watchdog())
 				return;
+			break;
+		case X86_VENDOR_INTEL:
+			switch (boot_cpu_data.x86) {
+			case 6:
+				if (boot_cpu_data.x86_model > 0xd)
+					return;
+
+				if (!setup_p6_watchdog())
+					return;
+				break;
+			case 15:
+				if (boot_cpu_data.x86_model > 0x4)
+					return;

-			if (!setup_p4_watchdog())
+				if (!setup_p4_watchdog())
+					return;
+				break;
+			default:
 				return;
+			}
+			break;
+		default:
+			return;
+		}
+	}
+	__get_cpu_var(nmi_watchdog_ctlblk.enabled) = 1;
+	atomic_inc(&nmi_active);
+}
+
+static void stop_apic_nmi_watchdog(void *unused)
+{
+	/* only support LOCAL and IO APICs for now */
+	if ((nmi_watchdog != NMI_LOCAL_APIC) &&
+	    (nmi_watchdog != NMI_IO_APIC))
+		return;
+
+	if (nmi_watchdog == NMI_LOCAL_APIC) {
+		switch (boot_cpu_data.x86_vendor) {
+		case X86_VENDOR_AMD:
+			stop_k7_watchdog();
+			break;
+		case X86_VENDOR_INTEL:
+			switch (boot_cpu_data.x86) {
+			case 6:
+				if (boot_cpu_data.x86_model > 0xd)
+					break;
+				stop_p6_watchdog();
+				break;
+			case 15:
+				if (boot_cpu_data.x86_model > 0x4)
+					break;
+				stop_p4_watchdog();
+				break;
+			}
 			break;
 		default:
 			return;
 		}
-		break;
-	default:
-		return;
 	}
-	lapic_nmi_owner = LAPIC_NMI_WATCHDOG;
-	nmi_active = 1;
+	__get_cpu_var(nmi_watchdog_ctlblk.enabled) = 0;
+	atomic_dec(&nmi_active);
 }

 /*
@@ -635,7 +781,7 @@ EXPORT_SYMBOL(touch_nmi_watchdog);

 extern void die_nmi(struct pt_regs *, const char *msg);

-void nmi_watchdog_tick (struct pt_regs * regs)
+void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
 {

 	/*
@@ -644,11 +790,21 @@ void nmi_watchdog_tick (struct pt_regs * regs)
 	 * smp_processor_id().
 	 */
 	unsigned int sum;
+	int touched = 0;
 	int cpu = smp_processor_id();
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+	u64 dummy;
+
+	/* check for other users first */
+	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
+			== NOTIFY_STOP) {
+		touched = 1;
+	}

 	sum = per_cpu(irq_stat, cpu).apic_timer_irqs;

-	if (last_irq_sums[cpu] == sum) {
+	/* if the apic timer isn't firing, this cpu isn't doing much */
+	if (!touched && last_irq_sums[cpu] == sum) {
 		/*
 		 * Ayiee, looks like this CPU is stuck ...
 		 * wait a few IRQs (5 seconds) before doing the oops ...
@@ -663,26 +819,41 @@ void nmi_watchdog_tick (struct pt_regs * regs)
 		last_irq_sums[cpu] = sum;
 		alert_counter[cpu] = 0;
 	}
-	if (nmi_perfctr_msr) {
-		if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) {
-			/*
-			 * P4 quirks:
-			 * - An overflown perfctr will assert its interrupt
-			 *   until the OVF flag in its CCCR is cleared.
-			 * - LVTPC is masked on interrupt and must be
-			 *   unmasked by the LVTPC handler.
-			 */
-			wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
-			apic_write(APIC_LVTPC, APIC_DM_NMI);
-		}
-		else if (nmi_perfctr_msr == MSR_P6_PERFCTR0) {
-			/* Only P6 based Pentium M need to re-unmask
-			 * the apic vector but it doesn't hurt
-			 * other P6 variant */
-			apic_write(APIC_LVTPC, APIC_DM_NMI);
+	/* see if the nmi watchdog went off */
+	if (wd->enabled) {
+		if (nmi_watchdog == NMI_LOCAL_APIC) {
+			rdmsrl(wd->perfctr_msr, dummy);
+			if (dummy & wd->check_bit){
+				/* this wasn't a watchdog timer interrupt */
+				goto done;
+			}
+
+			/* only Intel P4 uses the cccr msr */
+			if (wd->cccr_msr != 0) {
+				/*
+				 * P4 quirks:
+				 * - An overflown perfctr will assert its interrupt
+				 *   until the OVF flag in its CCCR is cleared.
+				 * - LVTPC is masked on interrupt and must be
+				 *   unmasked by the LVTPC handler.
+				 */
+				rdmsrl(wd->cccr_msr, dummy);
+				dummy &= ~P4_CCCR_OVF;
+				wrmsrl(wd->cccr_msr, dummy);
+				apic_write(APIC_LVTPC, APIC_DM_NMI);
+			}
+			else if (wd->perfctr_msr == MSR_P6_PERFCTR0) {
+				/* Only P6 based Pentium M need to re-unmask
+				 * the apic vector but it doesn't hurt
+				 * other P6 variant */
+				apic_write(APIC_LVTPC, APIC_DM_NMI);
+			}
+			/* start the cycle over again */
+			write_watchdog_counter(wd->perfctr_msr, NULL);
 		}
-		write_watchdog_counter(NULL);
 	}
+done:
+	return;
 }

 #ifdef CONFIG_SYSCTL

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 7e9edafffd8..3a07b2677e2 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -724,7 +724,7 @@ static void default_do_nmi(struct pt_regs * regs)
 		 * so it must be the NMI watchdog.
 		 */
 		if (nmi_watchdog) {
-			nmi_watchdog_tick(regs);
+			nmi_watchdog_tick(regs, reason);
 			return;
 		}
 #endif
--
cgit v1.2.3


From 3adbbcce9a49b900d4cc118cdccfdefa78bf1afb Mon Sep 17 00:00:00 2001
From: Don Zickus
Date: Tue, 26 Sep 2006 10:52:26 +0200
Subject: [PATCH] x86: Cleanup NMI interrupt path

This patch cleans up the NMI interrupt path. Instead of being gated by
if the 'nmi callback' is set, the interrupt handler now calls everyone
who is registered on the die_chain and additionally checks the nmi
watchdog, resetting it if enabled. This allows more subsystems to hook
into the NMI if they need to (without being blocked by
set_nmi_callback).

Signed-off-by: Don Zickus
Signed-off-by: Andi Kleen
---
 arch/i386/kernel/nmi.c   | 16 +++++++++++++---
 arch/i386/kernel/traps.c | 24 +++++++++++-------------
 2 files changed, 24 insertions(+), 16 deletions(-)
(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index d8800434303..bd96ea4f294 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -781,7 +781,7 @@ EXPORT_SYMBOL(touch_nmi_watchdog);

 extern void die_nmi(struct pt_regs *, const char *msg);

-void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
+int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
 {

 	/*
@@ -794,10 +794,12 @@ void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
 	int cpu = smp_processor_id();
 	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
 	u64 dummy;
+	int rc=0;

 	/* check for other users first */
 	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
 			== NOTIFY_STOP) {
+		rc = 1;
 		touched = 1;
 	}

@@ -850,10 +852,18 @@ void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
 			}
 			/* start the cycle over again */
 			write_watchdog_counter(wd->perfctr_msr, NULL);
-		}
+			rc = 1;
+		} else if (nmi_watchdog == NMI_IO_APIC) {
+			/* don't know how to accurately check for this.
+			 * just assume it was a watchdog timer interrupt
+			 * This matches the old behaviour.
+			 */
+			rc = 1;
+		} else
+			printk(KERN_WARNING "Unknown enabled NMI hardware?!\n");
 	}
done:
-	return;
+	return rc;
 }

 #ifdef CONFIG_SYSCTL

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 3a07b2677e2..282f0bd40df 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -706,6 +706,13 @@ void die_nmi (struct pt_regs *regs, const char *msg)
 	do_exit(SIGSEGV);
 }

+static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
+{
+	return 0;
+}
+
+static nmi_callback_t nmi_callback = dummy_nmi_callback;
+
 static void default_do_nmi(struct pt_regs * regs)
 {
 	unsigned char reason = 0;
@@ -723,12 +730,11 @@ static void default_do_nmi(struct pt_regs * regs)
 		 * Ok, so this is none of the documented NMI sources,
 		 * so it must be the NMI watchdog.
 		 */
-		if (nmi_watchdog) {
-			nmi_watchdog_tick(regs, reason);
+		if (nmi_watchdog_tick(regs, reason))
 			return;
-		}
 #endif
-		unknown_nmi_error(reason, regs);
+		if (!rcu_dereference(nmi_callback)(regs, smp_processor_id()))
+			unknown_nmi_error(reason, regs);
 		return;
 	}
 	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
@@ -744,13 +750,6 @@ static void default_do_nmi(struct pt_regs * regs)
 		reassert_nmi();
 }

-static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
-{
-	return 0;
-}
-
-static nmi_callback_t nmi_callback = dummy_nmi_callback;
-
 fastcall void do_nmi(struct pt_regs * regs, long error_code)
 {
 	int cpu;
@@ -761,8 +760,7 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code)

 	++nmi_count(cpu);

-	if (!rcu_dereference(nmi_callback)(regs, cpu))
-		default_do_nmi(regs);
+	default_do_nmi(regs);

 	nmi_exit();
 }
--
cgit v1.2.3


From 2fbe7b25c8edaf2d10e6c1a4cc9f8afe714c4764 Mon Sep 17 00:00:00 2001
From: Don Zickus
Date: Tue, 26 Sep 2006 10:52:27 +0200
Subject: [PATCH] i386/x86-64: Remove un/set_nmi_callback and
 reserve/release_lapic_nmi functions

Removes the un/set_nmi_callback and reserve/release_lapic_nmi functions
as they are no longer needed. The various subsystems are modified to
register with the die_notifier instead.

Also includes compile fixes by Andrew Morton.

Signed-off-by: Don Zickus
Signed-off-by: Andi Kleen
---
 arch/i386/kernel/crash.c | 20 ++++++++++--
 arch/i386/kernel/nmi.c   | 85 +++++++-----------------------------------------
 arch/i386/kernel/traps.c | 23 ++-----------
 3 files changed, 31 insertions(+), 97 deletions(-)
(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index 5b96f038367..736c76d6b31 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -22,6 +22,8 @@
 #include
 #include
 #include
+#include
+
 #include
@@ -93,9 +95,18 @@ static void crash_save_self(struct pt_regs *regs)
 #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
 static atomic_t waiting_for_crash_ipi;

-static int crash_nmi_callback(struct pt_regs *regs, int cpu)
+static int crash_nmi_callback(struct notifier_block *self,
+			unsigned long val, void *data)
 {
+	struct pt_regs *regs;
 	struct pt_regs fixed_regs;
+	int cpu;
+
+	if (val != DIE_NMI)
+		return NOTIFY_OK;
+
+	regs = ((struct die_args *)data)->regs;
+	cpu = raw_smp_processor_id();

 	/* Don't do anything if this handler is invoked on crashing cpu.
	 * Otherwise, system will completely hang.
	 * Crashing cpu can get
	 * an NMI if system was initially booted with nmi_watchdog parameter.
	 */
 	if (cpu == crashing_cpu)
		return 1;
 	local_irq_disable();
@@ -125,13 +136,18 @@ static void smp_send_nmi_allbutself(void)
 	send_IPI_allbutself(NMI_VECTOR);
 }

+static struct notifier_block crash_nmi_nb = {
+	.notifier_call = crash_nmi_callback,
+};
+
 static void nmi_shootdown_cpus(void)
 {
 	unsigned long msecs;

 	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
 	/* Would it be better to replace the trap vector here? */
-	set_nmi_callback(crash_nmi_callback);
+	if (register_die_notifier(&crash_nmi_nb))
+		return;		/* return what? */

 	/* Ensure the new callback function is set before sending
 	 * out the NMI
 	 */

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index bd96ea4f294..acd3fdea2a2 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -42,20 +42,6 @@ static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);
 */
 #define NMI_MAX_COUNTER_BITS 66

-/*
- * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
- * - it may be reserved by some other driver, or not
- * - when not reserved by some other driver, it may be used for
- *   the NMI watchdog, or not
- *
- * This is maintained separately from nmi_active because the NMI
- * watchdog may also be driven from the I/O APIC timer.
- */
-static DEFINE_SPINLOCK(lapic_nmi_owner_lock);
-static unsigned int lapic_nmi_owner;
-#define LAPIC_NMI_WATCHDOG	(1<<0)
-#define LAPIC_NMI_RESERVED	(1<<1)
-
 /* nmi_active:
@@ -325,33 +311,6 @@ static void enable_lapic_nmi_watchdog(void)
 	touch_nmi_watchdog();
 }

-int reserve_lapic_nmi(void)
-{
-	unsigned int old_owner;
-
-	spin_lock(&lapic_nmi_owner_lock);
-	old_owner = lapic_nmi_owner;
-	lapic_nmi_owner |= LAPIC_NMI_RESERVED;
-	spin_unlock(&lapic_nmi_owner_lock);
-	if (old_owner & LAPIC_NMI_RESERVED)
-		return -EBUSY;
-	if (old_owner & LAPIC_NMI_WATCHDOG)
-		disable_lapic_nmi_watchdog();
-	return 0;
-}
-
-void release_lapic_nmi(void)
-{
-	unsigned int new_owner;
-
-	spin_lock(&lapic_nmi_owner_lock);
-	new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED;
-	lapic_nmi_owner = new_owner;
-	spin_unlock(&lapic_nmi_owner_lock);
-	if (new_owner & LAPIC_NMI_WATCHDOG)
-		enable_lapic_nmi_watchdog();
-}
-
 void disable_timer_nmi_watchdog(void)
 {
@@ -866,6 +825,15 @@ done:
 	return rc;
 }

+int do_nmi_callback(struct pt_regs * regs, int cpu)
+{
+#ifdef CONFIG_SYSCTL
+	if (unknown_nmi_panic)
+		return unknown_nmi_panic_callback(regs, cpu);
+#endif
+	return 0;
+}
+
 #ifdef CONFIG_SYSCTL

 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
@@ -873,37 +841,8 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
 	unsigned char reason = get_nmi_reason();
 	char buf[64];

-	if (!(reason & 0xc0)) {
-		sprintf(buf, "NMI received for unknown reason %02x\n", reason);
-		die_nmi(regs, buf);
-	}
-	return 0;
-}
-
-/*
- * proc handler for /proc/sys/kernel/unknown_nmi_panic
- */
-int proc_unknown_nmi_panic(ctl_table *table, int write, struct file *file,
-			void __user *buffer, size_t *length, loff_t *ppos)
-{
-	int old_state;
-
-	old_state = unknown_nmi_panic;
-	proc_dointvec(table, write, file, buffer, length, ppos);
-	if (!!old_state == !!unknown_nmi_panic)
-		return 0;
-
-	if (unknown_nmi_panic) {
-		if (reserve_lapic_nmi() < 0) {
-			unknown_nmi_panic = 0;
-			return -EBUSY;
-		} else {
-			set_nmi_callback(unknown_nmi_panic_callback);
-		}
-	} else {
-		release_lapic_nmi();
-		unset_nmi_callback();
-	}
+	sprintf(buf, "NMI received for unknown reason %02x\n", reason);
+	die_nmi(regs, buf);
 	return 0;
 }

@@ -917,7 +856,5 @@ EXPORT_SYMBOL(reserve_perfctr_nmi);
 EXPORT_SYMBOL(release_perfctr_nmi);
 EXPORT_SYMBOL(reserve_evntsel_nmi);
 EXPORT_SYMBOL(release_evntsel_nmi);
-EXPORT_SYMBOL(reserve_lapic_nmi);
-EXPORT_SYMBOL(release_lapic_nmi);
 EXPORT_SYMBOL(disable_timer_nmi_watchdog);
 EXPORT_SYMBOL(enable_timer_nmi_watchdog);

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 282f0bd40df..7db664d0b25 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -706,13 +706,6 @@ void die_nmi (struct pt_regs *regs, const char *msg)
 	do_exit(SIGSEGV);
 }

-static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
-{
-	return 0;
-}
-
-static nmi_callback_t nmi_callback = dummy_nmi_callback;
-
 static void default_do_nmi(struct pt_regs * regs)
 {
 	unsigned char reason = 0;
@@ -732,9 +725,10 @@ static void default_do_nmi(struct pt_regs * regs)
 		 */
 		if (nmi_watchdog_tick(regs, reason))
 			return;
+		if (!do_nmi_callback(regs, smp_processor_id()))
 #endif
-		if (!rcu_dereference(nmi_callback)(regs, smp_processor_id()))
 			unknown_nmi_error(reason, regs);
+
 		return;
 	}
 	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
@@ -765,19 +759,6 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code)
 	nmi_exit();
 }

-void set_nmi_callback(nmi_callback_t callback)
-{
-	vmalloc_sync_all();
-	rcu_assign_pointer(nmi_callback, callback);
-}
-EXPORT_SYMBOL_GPL(set_nmi_callback);
-
-void unset_nmi_callback(void)
-{
-	nmi_callback = dummy_nmi_callback;
-}
-EXPORT_SYMBOL_GPL(unset_nmi_callback);
-
 #ifdef CONFIG_KPROBES
 fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
 {
--
cgit v1.2.3


From 407984f1af259b31957c7c05075a454a751bb801 Mon Sep 17 00:00:00 2001
From: Don Zickus
Date: Tue, 26 Sep 2006 10:52:27 +0200
Subject: [PATCH] x86: Add ability to enable/disable nmi watchdog with sysctl

Adds a new /proc/sys/kernel/nmi call that will enable/disable the nmi
watchdog.

Signed-off-by: Don Zickus
Signed-off-by: Andi Kleen
---
 arch/i386/kernel/nmi.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index acd3fdea2a2..28065d0b71a 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -846,6 +846,58 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
 	return 0;
 }

+/*
+ * proc handler for /proc/sys/kernel/nmi_watchdog
+ */
+int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
+			void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int old_state;
+
+	nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ?
+				1 : 0;
+	old_state = nmi_watchdog_enabled;
+	proc_dointvec(table, write, file, buffer, length, ppos);
+	if (!!old_state == !!nmi_watchdog_enabled)
+		return 0;
+
+	if (atomic_read(&nmi_active) < 0) {
+		printk(KERN_WARNING "NMI watchdog is permanently disabled\n");
+		return -EINVAL;
+	}
+
+	if (nmi_watchdog == NMI_DEFAULT) {
+		if (nmi_known_cpu() > 0)
+			nmi_watchdog = NMI_LOCAL_APIC;
+		else
+			nmi_watchdog = NMI_IO_APIC;
+	}
+
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+	{
+		if (nmi_watchdog_enabled)
+			enable_lapic_nmi_watchdog();
+		else
+			disable_lapic_nmi_watchdog();
+	} else if (nmi_watchdog == NMI_IO_APIC) {
+		/* FIXME
+		 * for some reason these functions don't work
+		 */
+		printk("Can not enable/disable NMI on IO APIC\n");
+		return -EINVAL;
+#if 0
+		if (nmi_watchdog_enabled)
+			enable_timer_nmi_watchdog();
+		else
+			disable_timer_nmi_watchdog();
+#endif
+	} else {
+		printk( KERN_WARNING
+			"NMI watchdog doesn't know what hardware to touch\n");
+		return -EIO;
+	}
+	return 0;
+}
+
 #endif

 EXPORT_SYMBOL(nmi_active);
--
cgit v1.2.3


From e33e89ab1a8d295de0500b697f4f31c3ceee9aa2 Mon Sep 17 00:00:00 2001
From: Don Zickus
Date: Tue, 26 Sep 2006 10:52:27 +0200
Subject: [PATCH] x86: Add ability to enable/disable nmi watchdog from procfs
 (update)

Adds a new /proc/sys/kernel/nmi_watchdog call that will enable/disable
the nmi watchdog.

By entering a non-zero value here, a user can enable the nmi watchdog
to monitor the online cpus in the system.

By entering a zero value here, a user can disable the nmi watchdog and
free up a performance counter which could then be utilized by the
oprofile subsystem, otherwise oprofile may be short a counter when in
use.

Signed-off-by: Don Zickus
Signed-off-by: Andi Kleen
Cc: Andi Kleen
Signed-off-by: Andrew Morton
---
 arch/i386/kernel/nmi.c | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)
(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 28065d0b71a..6241e4448ca 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -847,7 +847,7 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
 }

 /*
- * proc handler for /proc/sys/kernel/nmi_watchdog
+ * proc handler for /proc/sys/kernel/nmi
 */
 int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
 			void __user *buffer, size_t *length, loff_t *ppos)
@@ -861,8 +861,8 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
 		return 0;

 	if (atomic_read(&nmi_active) < 0) {
-		printk(KERN_WARNING "NMI watchdog is permanently disabled\n");
-		return -EINVAL;
+		printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
+		return -EIO;
 	}

 	if (nmi_watchdog == NMI_DEFAULT) {
@@ -872,24 +872,11 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
 			nmi_watchdog = NMI_IO_APIC;
 	}

-	if (nmi_watchdog == NMI_LOCAL_APIC)
-	{
+	if (nmi_watchdog == NMI_LOCAL_APIC) {
 		if (nmi_watchdog_enabled)
 			enable_lapic_nmi_watchdog();
 		else
 			disable_lapic_nmi_watchdog();
-	} else if (nmi_watchdog == NMI_IO_APIC) {
-		/* FIXME
-		 * for some reason these functions don't work
-		 */
-		printk("Can not enable/disable NMI on IO APIC\n");
-		return -EINVAL;
-#if 0
-		if (nmi_watchdog_enabled)
-			enable_timer_nmi_watchdog();
-		else
-			disable_timer_nmi_watchdog();
-#endif
 	} else {
 		printk( KERN_WARNING
 			"NMI watchdog doesn't know what hardware to touch\n");
--
cgit v1.2.3


From 8da5adda91df3d2fcc5300e68da491694c9af019 Mon Sep 17 00:00:00 2001
From: Don Zickus
Date: Tue, 26 Sep 2006 10:52:27 +0200
Subject: [PATCH] x86: Allow
 users to force a panic on NMI

To quote Alan Cox:

The default Linux behaviour on an NMI of either memory or unknown is to
continue operation. For many environments such as scientific computing
it is preferable that the box is taken out and the error dealt with than
an uncorrected parity/ECC error get propagated.

A small number of systems do generate NMI's for bizarre random reasons
such as power management so the default is unchanged. In other respects
the new proc/sys entry works like the existing panic controls already in
that directory.

This is separate to the edac support - EDAC allows supported chipsets to
handle ECC errors well, this change allows unsupported cases to at least
panic rather than cause problems further down the line.

Signed-off-by: Don Zickus
Signed-off-by: Andi Kleen
---
 arch/i386/kernel/traps.c | 6 ++++++
 1 file changed, 6 insertions(+)
(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 7db664d0b25..2f6cb827648 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -635,6 +635,8 @@ static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
 		"to continue\n");
 	printk(KERN_EMERG "You probably have a hardware problem with your RAM "
 		"chips\n");
+	if (panic_on_unrecovered_nmi)
+		panic("NMI: Not continuing");

 	/* Clear and disable the memory parity error line. */
 	clear_mem_error(reason);
@@ -670,6 +672,10 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
 		reason, smp_processor_id());
 	printk("Dazed and confused, but trying to continue\n");
 	printk("Do you have a strange power saving mode enabled?\n");
+
+	if (panic_on_unrecovered_nmi)
+		panic("NMI: Not continuing");
+
 }

 static DEFINE_SPINLOCK(nmi_print_lock);
--
cgit v1.2.3


From c41c5cd3b20a2d81c30498f13b1527847a8fdf69 Mon Sep 17 00:00:00 2001
From: Don Zickus
Date: Tue, 26 Sep 2006 10:52:27 +0200
Subject: [PATCH] x86: Clean up nmi panic messages

Clean up some of the output messages on the nmi error paths to make
more sense when they are displayed. This is mainly a cosmetic fix and
shouldn't impact any normal code path.

Signed-off-by: Don Zickus
Signed-off-by: Andi Kleen
---
 arch/i386/kernel/traps.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)
(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 2f6cb827648..3c85c89f68d 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -631,13 +631,15 @@ gp_in_kernel:

 static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
 {
-	printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying "
-		"to continue\n");
+	printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
+		"CPU %d.\n", reason, smp_processor_id());
 	printk(KERN_EMERG "You probably have a hardware problem with your RAM "
 		"chips\n");
 	if (panic_on_unrecovered_nmi)
 		panic("NMI: Not continuing");

+	printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+
 	/* Clear and disable the memory parity error line. */
 	clear_mem_error(reason);
 }
@@ -668,14 +670,13 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
 		return;
 	}
 #endif
-	printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
-		reason, smp_processor_id());
-	printk("Dazed and confused, but trying to continue\n");
-	printk("Do you have a strange power saving mode enabled?\n");
-
+	printk(KERN_EMERG "Uhhuh.
 NMI received for unknown reason %02x on "
+		"CPU %d.\n", reason, smp_processor_id());
+	printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
 	if (panic_on_unrecovered_nmi)
 		panic("NMI: Not continuing");

+	printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
 }

 static DEFINE_SPINLOCK(nmi_print_lock);
--
cgit v1.2.3


From 4038f901cf102a40715b900984ed7540a9fa637f Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Tue, 26 Sep 2006 10:52:27 +0200
Subject: [PATCH] i386/x86-64: Fix NMI watchdog suspend/resume

Making NMI suspend/resume work with SMP. We use CPU hotplug to offline
APs in SMP suspend/resume. Only BSP executes sysdev's .suspend/.resume
method. APs should follow CPU hotplug code path.

And:

+From: Don Zickus

Makes the start/stop paths of nmi watchdog more robust to handle the
suspend/resume cases more gracefully.

AK: I merged the two patches together

Signed-off-by: Shaohua Li
Signed-off-by: Andi Kleen
Cc: Don Zickus
Cc: Andi Kleen
Signed-off-by: Andrew Morton
---
 arch/i386/kernel/nmi.c     | 33 ++++++++++++++++++++++++++-------
 arch/i386/kernel/smpboot.c |  3 ++-
 2 files changed, 28 insertions(+), 8 deletions(-)
(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 6241e4448ca..8e4ed930ce6 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -63,7 +63,6 @@ struct nmi_watchdog_ctlblk {
 static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);

 /* local prototypes */
-static void stop_apic_nmi_watchdog(void *unused);
 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);

 extern void show_registers(struct pt_regs *regs);
@@ -341,15 +340,20 @@ static int nmi_pm_active; /* nmi_active before suspend */

 static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
 {
+	/* only CPU0 goes here, other CPUs should be offline */
 	nmi_pm_active = atomic_read(&nmi_active);
-	disable_lapic_nmi_watchdog();
+	stop_apic_nmi_watchdog(NULL);
+	BUG_ON(atomic_read(&nmi_active) != 0);
 	return 0;
 }

 static int lapic_nmi_resume(struct sys_device *dev)
 {
-	if (nmi_pm_active > 0)
-		enable_lapic_nmi_watchdog();
+	/* only CPU0 goes here, other CPUs should be offline */
+	if (nmi_pm_active > 0) {
+		setup_apic_nmi_watchdog(NULL);
+		touch_nmi_watchdog();
+	}
 	return 0;
 }

@@ -626,11 +630,21 @@ static void stop_p4_watchdog(void)

 void setup_apic_nmi_watchdog (void *unused)
 {
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
 	/* only support LOCAL and IO APICs for now */
 	if ((nmi_watchdog != NMI_LOCAL_APIC) &&
 	    (nmi_watchdog != NMI_IO_APIC))
 		return;

+	if (wd->enabled == 1)
+		return;
+
+	/* cheap hack to support suspend/resume */
+	/* if cpu0 is not active neither should the other cpus */
+	if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
+		return;
+
 	if (nmi_watchdog == NMI_LOCAL_APIC) {
 		switch (boot_cpu_data.x86_vendor) {
 		case X86_VENDOR_AMD:
@@ -663,17 +677,22 @@ void setup_apic_nmi_watchdog (void *unused)
 			return;
 		}
 	}
-	__get_cpu_var(nmi_watchdog_ctlblk.enabled) = 1;
+	wd->enabled = 1;
 	atomic_inc(&nmi_active);
 }

-static void
stop_apic_nmi_watchdog(void *unused) return; } } - __get_cpu_var(nmi_watchdog_ctlblk.enabled) = 0; + wd->enabled = 0; atomic_dec(&nmi_active); } diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index efe07990e7f..9367af76ce3 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -1376,7 +1376,8 @@ int __cpu_disable(void) */ if (cpu == 0) return -EBUSY; - + if (nmi_watchdog == NMI_LOCAL_APIC) + stop_apic_nmi_watchdog(NULL); clear_local_APIC(); /* Allow any queued timer interrupts to get serviced */ local_irq_enable(); -- cgit v1.2.3 From 260d6790b6a2a0a048b7f96d154c2b49f1e6515a Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] i386: Kdump i386 nmi event notification fix After a crash we should wait for the NMI IPI event and not for an external NMI or an NMI watchdog tick. Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen Cc: Don Zickus Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/crash.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c index 736c76d6b31..67d297dc100 100644 --- a/arch/i386/kernel/crash.c +++ b/arch/i386/kernel/crash.c @@ -102,7 +102,7 @@ static int crash_nmi_callback(struct notifier_block *self, struct pt_regs fixed_regs; int cpu; - if (val != DIE_NMI) + if (val != DIE_NMI_IPI) return NOTIFY_OK; regs = ((struct die_args *)data)->regs; @@ -113,7 +113,7 @@ static int crash_nmi_callback(struct notifier_block *self, * an NMI if system was initially booted with nmi_watchdog parameter. */ if (cpu == crashing_cpu) - return 1; + return NOTIFY_STOP; local_irq_disable(); if (!user_mode_vm(regs)) { -- cgit v1.2.3 From 1de84979dfc527c422abf63f27beabe43892989b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] i386: Enable NMI watchdog by default I've had good experiences with having this on by default on x86-64. It turns nasty hangs into easier-to-debug oopses. Enable the local APIC wdog by default for systems newer than 2004. This comes from a strange compromise: according to arjan the reason it was off by default was some old IBM systems that corrupted registers when an NMI happened in SMM. I can't remember anything more specific, but >= 2004 should avoid these. It's probably overly broad because most older systems should be ok (and the really old systems won't be supported by the local apic watchdog anyway) Signed-off-by: Andi Kleen --- arch/i386/kernel/nmi.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 8e4ed930ce6..6e5085d5d2f 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -204,6 +205,14 @@ static int __init check_nmi_watchdog(void) unsigned int *prev_nmi_count; int cpu; + /* Enable NMI watchdog for newer systems. Actually it should be safe for most systems before 2004 too except for some IBM systems that corrupt registers when NMI happens during SMM. Unfortunately we don't have more exact information on these and use this coarse check. 
*/ + if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004) + nmi_watchdog = NMI_LOCAL_APIC; + if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT)) return 0; -- cgit v1.2.3 From 248dcb2ffffe8f3e4a369556a68988788c208111 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] x86: i386/x86-64 Add nmi watchdog support for new Intel CPUs AK: This redoes the changes I temporarily reverted. Intel now has support for Architectural Performance Monitoring Counters ( Refer to IA-32 Intel Architecture Software Developer's Manual http://www.intel.com/design/pentium4/manuals/253669.htm ). This feature is present starting from Intel Core Duo and Intel Core Solo processors. What this means is that the performance monitoring counters and some performance monitoring events are now defined in an architectural way (using cpuid), and there is no need to check for family/model etc. for these architectural events. Below is the patch to use these performance counters in the nmi watchdog driver. Patch handles both i386 and x86-64 kernels. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Andi Kleen --- arch/i386/kernel/nmi.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 121 insertions(+), 5 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 6e5085d5d2f..7b9a053effa 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "mach_traps.h" @@ -77,6 +78,9 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) case X86_VENDOR_AMD: return (msr - MSR_K7_PERFCTR0); case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return (msr - MSR_ARCH_PERFMON_PERFCTR0); + switch (boot_cpu_data.x86) { case 6: return (msr - MSR_P6_PERFCTR0); @@ -95,6 +99,9 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) case X86_VENDOR_AMD: return (msr - MSR_K7_EVNTSEL0); case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return (msr - MSR_ARCH_PERFMON_EVENTSEL0); + switch (boot_cpu_data.x86) { case 6: return (msr - MSR_P6_EVNTSEL0); @@ -174,7 +181,10 @@ static __cpuinit inline int nmi_known_cpu(void) case X86_VENDOR_AMD: return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); case X86_VENDOR_INTEL: - return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return 1; + else + return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); } return 0; } @@ -261,8 +271,24 @@ static int __init check_nmi_watchdog(void) /* now that we know it works we can reduce NMI frequency to something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) + if (nmi_watchdog == NMI_LOCAL_APIC) { + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + nmi_hz = 1; + /* + * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter + * are writable, with higher bits sign extending from bit 31. + * So, we can only program the counter with 31 bit values and + * 32nd bit should be 1, for 33.. to be 1. 
+ * Find the appropriate nmi_hz + */ + if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 && + ((u64)cpu_khz * 1000) > 0x7fffffffULL) { + u64 count = (u64)cpu_khz * 1000; + do_div(count, 0x7fffffffUL); + nmi_hz = count + 1; + } + } kfree(prev_nmi_count); return 0; @@ -637,6 +663,85 @@ static void stop_p4_watchdog(void) release_perfctr_nmi(wd->perfctr_msr); } +#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL +#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK + +static int setup_intel_arch_watchdog(void) +{ + unsigned int ebx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int perfctr_msr, evntsel_msr; + unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebx indicates event present. + */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + goto fail; + + perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; + evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0; + + if (!reserve_perfctr_nmi(perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(evntsel_msr)) + goto fail1; + + wrmsrl(perfctr_msr, 0UL); + + evntsel = ARCH_PERFMON_EVENTSEL_INT + | ARCH_PERFMON_EVENTSEL_OS + | ARCH_PERFMON_EVENTSEL_USR + | ARCH_PERFMON_NMI_EVENT_SEL + | ARCH_PERFMON_NMI_EVENT_UMASK; + + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0"); + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + wd->check_bit = 1ULL << (eax.split.bit_width - 1); + return 1; +fail1: + release_perfctr_nmi(perfctr_msr); +fail: + return 0; +} + +static void stop_intel_arch_watchdog(void) +{ + unsigned int ebx; + union cpuid10_eax eax; + unsigned int unused; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebx indicates event present. 
+ */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + return; + + wrmsr(wd->evntsel_msr, 0, 0); + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + void setup_apic_nmi_watchdog (void *unused) { struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); @@ -663,6 +768,11 @@ void setup_apic_nmi_watchdog (void *unused) return; break; case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + if (!setup_intel_arch_watchdog()) + return; + break; + } switch (boot_cpu_data.x86) { case 6: if (boot_cpu_data.x86_model > 0xd) @@ -708,6 +818,10 @@ void stop_apic_nmi_watchdog(void *unused) stop_k7_watchdog(); break; case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + stop_intel_arch_watchdog(); + break; + } switch (boot_cpu_data.x86) { case 6: if (boot_cpu_data.x86_model > 0xd) @@ -831,10 +945,12 @@ int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) wrmsrl(wd->cccr_msr, dummy); apic_write(APIC_LVTPC, APIC_DM_NMI); } - else if (wd->perfctr_msr == MSR_P6_PERFCTR0) { - /* Only P6 based Pentium M need to re-unmask + else if (wd->perfctr_msr == MSR_P6_PERFCTR0 || + wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { + /* P6 based Pentium M need to re-unmask * the apic vector but it doesn't hurt - * other P6 variant */ + * other P6 variant. + * ArchPerfom/Core Duo also needs this */ apic_write(APIC_LVTPC, APIC_DM_NMI); } /* start the cycle over again */ -- cgit v1.2.3 From 3cfc348bf90ffaa777c188652aa297f04eb94de8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] x86: Add portable getcpu call For NUMA optimization and some other algorithms it is useful to have a fast way to get the current CPU and node numbers in user space. x86-64 added a fast way to do this in a vsyscall. This adds a generic syscall for other architectures to make it a generic portable facility. I expect some of them will also implement it as a faster vsyscall. The cache is an optimization for the x86-64 vsyscall implementation. Since what the syscall returns is an approximation anyway and user space often wants very fast results it can be cached for some time. The normal methods to get this information in user space are relatively slow. The vsyscall is in a better position to manage the cache because it has direct access to a fast time stamp (jiffies). For the generic syscall optimization it doesn't help much, but we enforce a valid argument to keep programs portable. I only added an i386 syscall entry for now. Other architectures can follow as needed. AK: Also added some cleanups from Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/syscall_table.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index dd63d477539..7e639f78b0b 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -317,3 +317,4 @@ ENTRY(sys_call_table) .long sys_tee /* 315 */ .long sys_vmsplice .long sys_move_pages + .long sys_getcpu -- cgit v1.2.3 From c16b63e09d9d03158e0a92e961234e94c4862620 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] i386/x86-64: Don't randomize stack top when no randomization personality is set Based on a patch from Frank van Maarseveen, but extended. 
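For illustration, a minimal userspace sketch (not part of the patch; it assumes ADDR_NO_RANDOMIZE is exposed by <sys/personality.h>, as glibc does) of how a debugger or test harness would request the non-randomized stack that arch_align_stack() now honors:

#include <stdio.h>
#include <unistd.h>
#include <sys/personality.h>

int main(int argc, char **argv)
{
	if (argc < 2)
		return 1;
	/* keep the current persona, just add the no-randomize flag */
	if (personality(personality(0xffffffff) | ADDR_NO_RANDOMIZE) < 0)
		perror("personality");
	execvp(argv[1], argv + 1);
	perror("execvp");
	return 1;
}

With that persona set, the child's stack top is still aligned but no longer offset by the random 0-8k amount.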
Signed-off-by: Andi Kleen --- arch/i386/kernel/process.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 8657c739656..b741c3e1a5e 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -905,7 +906,7 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) unsigned long arch_align_stack(unsigned long sp) { - if (randomize_va_space) + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) sp -= get_random_int() % 8192; return sp & ~0xf; } -- cgit v1.2.3 From 0cb91a2293648507886563ccb91979cfc94d6a4b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] i386: Account spinlocks to the caller during profiling for !FP kernels This ports the algorithm from x86-64 (with improvements) to i386. Previously this only worked for frame pointer enabled kernels. But spinlocks have a very simple stack frame that can be manually analyzed. Do this. Signed-off-by: Andi Kleen --- arch/i386/kernel/time.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index edd00f6cee3..5af802ef00b 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -130,18 +130,33 @@ static int set_rtc_mmss(unsigned long nowtime) int timer_ack; -#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); - if (!user_mode_vm(regs) && in_lock_functions(pc)) +#ifdef CONFIG_SMP + if (!user_mode_vm(regs) && in_lock_functions(pc)) { +#ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->ebp + 4); - +#else + unsigned long *sp; + if ((regs->xcs & 3) == 0) + sp = (unsigned long *)®s->esp; + else + sp = (unsigned long *)regs->esp; + /* Return address is either directly at stack pointer + or above a saved eflags. Eflags has bits 22-31 zero, + kernel addresses don't. */ + if (sp[0] >> 22) + return sp[0]; + if (sp[1] >> 22) + return sp[1]; +#endif + } +#endif return pc; } EXPORT_SYMBOL(profile_pc); -#endif /* * This is the same as the above, except we _also_ save the current -- cgit v1.2.3 From ecaf45ee5ce60afe7cc46e91d82c1b0cbda09387 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] i386: Redo semaphore and rwlock assembly helpers - Move them to a pure assembly file. Previously they were in a C file that only consisted of inline assembly. Doing it in pure assembler is much nicer. - Add a frame.i include with FRAME/ENDFRAME macros to easily add frame pointers to assembly functions - Add dwarf2 annotation to them so that the new dwarf2 unwinder doesn't get stuck on them - Random cleanups Includes feedback from Jan Beulich and a UML build fix from Andrew Morton. 
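The FRAME/ENDFRAME helpers can be pictured as something like the following sketch (hypothetical; the real frame.i also carries the dwarf2 CFI annotations mentioned above and may differ in detail):

/* include/asm-i386/frame.i, roughly: open/close a frame only when
 * CONFIG_FRAME_POINTER is set, so the helpers cost nothing otherwise */
#ifdef CONFIG_FRAME_POINTER
#define FRAME pushl %ebp; movl %esp,%ebp;
#define ENDFRAME popl %ebp;
#else
#define FRAME
#define ENDFRAME
#endif

An assembly helper then simply opens with FRAME and closes with ENDFRAME before its ret, instead of open-coding the #ifdef in every function as the old C file did.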
Cc: jbeulich@novell.com Cc: jdike@addtoit.com Signed-off-by: Andi Kleen --- arch/i386/kernel/Makefile | 2 +- arch/i386/kernel/semaphore.c | 134 ------------------------------------------- 2 files changed, 1 insertion(+), 135 deletions(-) delete mode 100644 arch/i386/kernel/semaphore.c (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 5427a842e84..dab497472de 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -4,7 +4,7 @@ extra-y := head.o init_task.o vmlinux.lds -obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ +obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o bootflag.o \ quirks.o i8237.o topology.o alternative.o i8253.o tsc.o diff --git a/arch/i386/kernel/semaphore.c b/arch/i386/kernel/semaphore.c deleted file mode 100644 index 98352c374c7..00000000000 --- a/arch/i386/kernel/semaphore.c +++ /dev/null @@ -1,134 +0,0 @@ -/* - * i386 semaphore implementation. - * - * (C) Copyright 1999 Linus Torvalds - * - * Portions Copyright 1999 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * rw semaphores implemented November 1999 by Benjamin LaHaise - */ -#include - -/* - * The semaphore operations have a special calling sequence that - * allow us to do a simpler in-line version of them. These routines - * need to convert that sequence back into the C sequence when - * there is contention on the semaphore. - * - * %eax contains the semaphore pointer on entry. Save the C-clobbered - * registers (%eax, %edx and %ecx) except %eax whish is either a return - * value or just clobbered.. 
- */ -asm( -".section .sched.text\n" -".align 4\n" -".globl __down_failed\n" -"__down_failed:\n\t" -#if defined(CONFIG_FRAME_POINTER) - "pushl %ebp\n\t" - "movl %esp,%ebp\n\t" -#endif - "pushl %edx\n\t" - "pushl %ecx\n\t" - "call __down\n\t" - "popl %ecx\n\t" - "popl %edx\n\t" -#if defined(CONFIG_FRAME_POINTER) - "movl %ebp,%esp\n\t" - "popl %ebp\n\t" -#endif - "ret" -); - -asm( -".section .sched.text\n" -".align 4\n" -".globl __down_failed_interruptible\n" -"__down_failed_interruptible:\n\t" -#if defined(CONFIG_FRAME_POINTER) - "pushl %ebp\n\t" - "movl %esp,%ebp\n\t" -#endif - "pushl %edx\n\t" - "pushl %ecx\n\t" - "call __down_interruptible\n\t" - "popl %ecx\n\t" - "popl %edx\n\t" -#if defined(CONFIG_FRAME_POINTER) - "movl %ebp,%esp\n\t" - "popl %ebp\n\t" -#endif - "ret" -); - -asm( -".section .sched.text\n" -".align 4\n" -".globl __down_failed_trylock\n" -"__down_failed_trylock:\n\t" -#if defined(CONFIG_FRAME_POINTER) - "pushl %ebp\n\t" - "movl %esp,%ebp\n\t" -#endif - "pushl %edx\n\t" - "pushl %ecx\n\t" - "call __down_trylock\n\t" - "popl %ecx\n\t" - "popl %edx\n\t" -#if defined(CONFIG_FRAME_POINTER) - "movl %ebp,%esp\n\t" - "popl %ebp\n\t" -#endif - "ret" -); - -asm( -".section .sched.text\n" -".align 4\n" -".globl __up_wakeup\n" -"__up_wakeup:\n\t" - "pushl %edx\n\t" - "pushl %ecx\n\t" - "call __up\n\t" - "popl %ecx\n\t" - "popl %edx\n\t" - "ret" -); - -/* - * rw spinlock fallbacks - */ -#if defined(CONFIG_SMP) -asm( -".section .sched.text\n" -".align 4\n" -".globl __write_lock_failed\n" -"__write_lock_failed:\n\t" - LOCK_PREFIX "addl $" RW_LOCK_BIAS_STR ",(%eax)\n" -"1: rep; nop\n\t" - "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jne 1b\n\t" - LOCK_PREFIX "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jnz __write_lock_failed\n\t" - "ret" -); - -asm( -".section .sched.text\n" -".align 4\n" -".globl __read_lock_failed\n" -"__read_lock_failed:\n\t" - LOCK_PREFIX "incl (%eax)\n" -"1: rep; nop\n\t" - "cmpl $1,(%eax)\n\t" - "js 1b\n\t" - LOCK_PREFIX "decl (%eax)\n\t" - "js __read_lock_failed\n\t" - "ret" -); -#endif -- cgit v1.2.3 From c1a58b42b428e717afbbb298356e041cea54ad17 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] i386/x86-64: Remove obsolete sanity check in mptable parsing It apparently has never triggered in many years. Signed-off-by: Andi Kleen --- arch/i386/kernel/mpparse.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index 82756968856..a2b126fe9a5 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -294,19 +294,6 @@ static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); - /* - * Well it seems all SMP boards in existence - * use ExtINT/LVT1 == LINT0 and - * NMI/LVT2 == LINT1 - the following check - * will show us if this assumptions is false. - * Until then we do not have to add baggage. 
- */ - if ((m->mpc_irqtype == mp_ExtINT) && - (m->mpc_destapiclint != 0)) - BUG(); - if ((m->mpc_irqtype == mp_NMI) && - (m->mpc_destapiclint != 1)) - BUG(); } #ifdef CONFIG_X86_NUMAQ -- cgit v1.2.3 From cf4c6a2f27f5db810b69dcb1da7f194489e8ff88 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] i386: Factor out common io apic routing entry access The IO APIC code had lots of duplicated code to read/write 64bit routing entries into the IO-APIC. Factor this out into common read/write functions. In a few cases the IO APIC lock is taken more often now, but this isn't a problem because it's all initialization/shutdown only slow path code. Similar to the earlier x86-64 patch. Includes a fix by Jiri Slaby for a mistake that broke resume. Signed-off-by: Andi Kleen --- arch/i386/kernel/io_apic.c | 100 +++++++++++++++++++-------------------------- 1 file changed, 43 insertions(+), 57 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 4eacd52eeb7..b713f27d7e8 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -94,6 +94,34 @@ int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; #define vector_to_irq(vector) (vector) #endif + +union entry_union { + struct { u32 w1, w2; }; + struct IO_APIC_route_entry entry; +}; + +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + return eu.entry; +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + union entry_union eu; + eu.entry = e; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. 
We are super @@ -201,13 +229,9 @@ static void unmask_IO_APIC_irq (unsigned int irq) static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; - unsigned long flags; /* Check delivery_mode to be sure we're not clearing an SMI pin */ - spin_lock_irqsave(&ioapic_lock, flags); - *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, pin); if (entry.delivery_mode == dest_SMI) return; @@ -216,10 +240,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) */ memset(&entry, 0, sizeof(entry)); entry.mask = 1; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry); } static void clear_IO_APIC (void) @@ -1284,9 +1305,8 @@ static void __init setup_IO_APIC_irqs(void) if (!apic && (irq < 16)) disable_8259A_irq(irq); } + ioapic_write_entry(apic, pin, entry); spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1302,7 +1322,6 @@ static void __init setup_IO_APIC_irqs(void) static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) { struct IO_APIC_route_entry entry; - unsigned long flags; memset(&entry,0,sizeof(entry)); @@ -1332,10 +1351,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in /* * Add it to the IO-APIC irq-routing table: */ - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry); enable_8259A_irq(0); } @@ -1445,10 +1461,7 @@ void __init print_IO_APIC(void) for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, i); printk(KERN_DEBUG " %02x %03X %02X ", i, @@ -1667,10 +1680,7 @@ static void __init enable_IO_APIC(void) /* See if any of the pins is in ExtINT mode */ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { struct IO_APIC_route_entry entry; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, pin); /* If the interrupt line is enabled and in ExtInt mode @@ -1727,7 +1737,6 @@ void disable_IO_APIC(void) */ if (ioapic_i8259.pin != -1) { struct IO_APIC_route_entry entry; - unsigned long flags; memset(&entry, 0, sizeof(entry)); entry.mask = 0; /* Enabled */ @@ -1744,12 +1753,7 @@ void disable_IO_APIC(void) /* * Add it to the IO-APIC irq-routing table: */ - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin, - *(((int *)&entry)+1)); - io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin, - *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + 
ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } disconnect_bsp_APIC(ioapic_i8259.pin != -1); } @@ -2214,17 +2218,13 @@ static inline void unlock_ExtINT_logic(void) int apic, pin, i; struct IO_APIC_route_entry entry0, entry1; unsigned char save_control, save_freq_select; - unsigned long flags; pin = find_isa_irq_pin(8, mp_INT); apic = find_isa_irq_apic(8, mp_INT); if (pin == -1) return; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry0 = ioapic_read_entry(apic, pin); clear_IO_APIC_pin(apic, pin); memset(&entry1, 0, sizeof(entry1)); @@ -2237,10 +2237,7 @@ static inline void unlock_ExtINT_logic(void) entry1.trigger = 0; entry1.vector = 0; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry1); save_control = CMOS_READ(RTC_CONTROL); save_freq_select = CMOS_READ(RTC_FREQ_SELECT); @@ -2259,10 +2256,7 @@ static inline void unlock_ExtINT_logic(void) CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); clear_IO_APIC_pin(apic, pin); - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry0); } int timer_uses_ioapic_pin_0; @@ -2462,17 +2456,12 @@ static int ioapic_suspend(struct sys_device *dev, pm_message_t state) { struct IO_APIC_route_entry *entry; struct sysfs_ioapic_data *data; - unsigned long flags; int i; data = container_of(dev, struct sysfs_ioapic_data, dev); entry = data->entry; - spin_lock_irqsave(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); - *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); - } - spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) + entry[i] = ioapic_read_entry(dev->id, i); return 0; } @@ -2494,11 +2483,9 @@ static int ioapic_resume(struct sys_device *dev) reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; io_apic_write(dev->id, 0, reg_00.raw); } - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); - io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); - } spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) + ioapic_write_entry(dev->id, i, entry[i]); return 0; } @@ -2695,9 +2682,8 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a if (!ioapic && (irq < 16)) disable_8259A_irq(irq); + ioapic_write_entry(ioapic, pin, entry); spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); -- cgit v1.2.3 From 19f03ffecdb599c1f64113b6dda0a1f143d2bab9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] i386: Clean up code style in mpparse.c ACPI code Remove some unlinuxy ways to write function parameter definitions. 
Remove some stray "return;"s. No functional change. Cc: len.brown@intel.com Signed-off-by: Andi Kleen --- arch/i386/kernel/mpparse.c | 52 +++++++++++++++------------------------------- 1 file changed, 17 insertions(+), 35 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index a2b126fe9a5..bdaa75a5bc1 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -810,8 +810,7 @@ int es7000_plat; #ifdef CONFIG_ACPI -void __init mp_register_lapic_address ( - u64 address) +void __init mp_register_lapic_address(u64 address) { mp_lapic_addr = (unsigned long) address; @@ -823,13 +822,10 @@ void __init mp_register_lapic_address ( Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); } - -void __devinit mp_register_lapic ( - u8 id, - u8 enabled) +void __devinit mp_register_lapic (u8 id, u8 enabled) { struct mpc_config_processor processor; - int boot_cpu = 0; + int boot_cpu = 0; if (MAX_APICS - id <= 0) { printk(KERN_WARNING "Processor #%d invalid (max %d)\n", @@ -866,11 +862,9 @@ static struct mp_ioapic_routing { u32 pin_programmed[4]; } mp_ioapic_routing[MAX_IO_APICS]; - -static int mp_find_ioapic ( - int gsi) +static int mp_find_ioapic (int gsi) { - int i = 0; + int i = 0; /* Find the IOAPIC that manages this GSI. */ for (i = 0; i < nr_ioapics; i++) { @@ -883,15 +877,11 @@ static int mp_find_ioapic ( return -1; } - -void __init mp_register_ioapic ( - u8 id, - u32 address, - u32 gsi_base) +void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) { - int idx = 0; - int tmpid; + int idx = 0; + int tmpid; if (nr_ioapics >= MAX_IO_APICS) { printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " @@ -937,16 +927,10 @@ void __init mp_register_ioapic ( mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); - - return; } - -void __init mp_override_legacy_irq ( - u8 bus_irq, - u8 polarity, - u8 trigger, - u32 gsi) +void __init +mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) { struct mpc_config_intsrc intsrc; int ioapic = -1; @@ -984,15 +968,13 @@ void __init mp_override_legacy_irq ( mp_irqs[mp_irq_entries] = intsrc; if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!\n"); - - return; } void __init mp_config_acpi_legacy_irqs (void) { struct mpc_config_intsrc intsrc; - int i = 0; - int ioapic = -1; + int i = 0; + int ioapic = -1; /* * Fabricate the legacy ISA bus (bus #31). @@ -1061,12 +1043,12 @@ void __init mp_config_acpi_legacy_irqs (void) #define MAX_GSI_NUM 4096 -int mp_register_gsi (u32 gsi, int triggering, int polarity) +int mp_register_gsi(u32 gsi, int triggering, int polarity) { - int ioapic = -1; - int ioapic_pin = 0; - int idx, bit = 0; - static int pci_irq = 16; + int ioapic = -1; + int ioapic_pin = 0; + int idx, bit = 0; + static int pci_irq = 16; /* * Mapping between Global System Interrups, which * represent all possible interrupts, and IRQs -- cgit v1.2.3 From ba9c231f7499ff6918c069c72ff5fd836c76b963 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] i386: initialize end-of-memory variables as early as possible Move initialization of all memory end variables as early as possible, so that dependent code doesn't need to check whether these variables have already been set. 
Change the range check in kunmap_atomic to actually make use of this so that the no-mapping-established path (under CONFIG_DEBUG_HIGHMEM) gets used only when the address is inside the lowmem area (and BUG() otherwise). Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/kernel/setup.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index f1682206d30..71a540362b7 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -1170,6 +1170,14 @@ static unsigned long __init setup_memory(void) } printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); + num_physpages = highend_pfn; + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; +#else + num_physpages = max_low_pfn; + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; +#endif +#ifdef CONFIG_FLATMEM + max_mapnr = num_physpages; #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", pages_to_mb(max_low_pfn)); -- cgit v1.2.3 From 1a3f239ddf9208f2e52d36fef1c1c4518cbbbabe Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] i386: Replace i386 open-coded cmdline parsing with This patch replaces the open-coded early commandline parsing throughout the i386 boot code with the generic mechanism (already used by ppc, powerpc, ia64 and s390). The code was inconsistent about whether it deletes the option from the cmdline or not, meaning some of these will get passed through the environment into init. This transformation is mainly mechanical, but there are some notable parts: 1) Grammar: s/linux never set's it up/linux never sets it up/ 2) Remove hacked-in earlyprintk= option scanning. When someone actually implements CONFIG_EARLY_PRINTK, then they can use early_param(). [AK: actually it is implemented, but I'm adding the early_param for it in the next x86-64 patch] 3) Move declaration of generic_apic_probe() from setup.c into asm/apic.h 4) Various parameters now moved into their appropriate files (thanks Andi). 5) All parse functions which examine arg need to check for NULL, except one where it has subtle humor value. 
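To make the converted pattern concrete, a registration looks roughly like this (hypothetical option name; the real conversions follow in the diff):

static unsigned int example_flag __initdata;

/* early_param() handlers run from parse_early_param() during
 * setup_arch(), well before the ordinary __setup() handlers;
 * returning -EINVAL makes the core complain about the bad option */
static int __init parse_example(char *arg)
{
	if (!arg)
		return -EINVAL;
	example_flag = simple_strtoul(arg, NULL, 0);
	return 0;
}
early_param("example", parse_example);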
AK: readded acpi_sci handling which was completely dropped AK: moved some more variables into acpi/boot.c Cc: len.brown@intel.com Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen --- arch/i386/kernel/acpi/boot.c | 76 ++++++++- arch/i386/kernel/apic.c | 15 ++ arch/i386/kernel/io_apic.c | 24 ++- arch/i386/kernel/machine_kexec.c | 23 +++ arch/i386/kernel/setup.c | 350 +++++++++++++-------------------------- arch/i386/kernel/smpboot.c | 13 ++ 6 files changed, 266 insertions(+), 235 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index ee003bc0e8b..87e2ab50b69 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -36,6 +36,8 @@ #include #include +int __initdata acpi_force = 0; + #ifdef CONFIG_X86_64 extern void __init clustered_apic_check(void); @@ -860,8 +862,6 @@ static void __init acpi_process_madt(void) return; } -extern int acpi_force; - #ifdef __i386__ static int __init disable_acpi_irq(struct dmi_system_id *d) @@ -1163,3 +1163,75 @@ int __init acpi_boot_init(void) return 0; } + +static int __init parse_acpi(char *arg) +{ + if (!arg) + return -EINVAL; + + /* "acpi=off" disables both ACPI table parsing and interpreter */ + if (strcmp(arg, "off") == 0) { + disable_acpi(); + } + /* acpi=force to over-ride black-list */ + else if (strcmp(arg, "force") == 0) { + acpi_force = 1; + acpi_ht = 1; + acpi_disabled = 0; + } + /* acpi=strict disables out-of-spec workarounds */ + else if (strcmp(arg, "strict") == 0) { + acpi_strict = 1; + } + /* Limit ACPI just to boot-time to enable HT */ + else if (strcmp(arg, "ht") == 0) { + if (!acpi_force) + disable_acpi(); + acpi_ht = 1; + } + /* "acpi=noirq" disables ACPI interrupt routing */ + else if (strcmp(arg, "noirq") == 0) { + acpi_noirq_set(); + } else { + /* Core will printk when we return error. */ + return -EINVAL; + } + return 0; +} +early_param("acpi", parse_acpi); + +/* FIXME: Using pci= for an ACPI parameter is a travesty. 
*/ +static int __init parse_pci(char *arg) +{ + if (arg && strcmp(arg, "noacpi") == 0) + acpi_disable_pci(); + return 0; +} +early_param("pci", parse_pci); + +#ifdef CONFIG_X86_IO_APIC +static int __init parse_acpi_skip_timer_override(char *arg) +{ + acpi_skip_timer_override = 1; + return 0; +} +early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override); +#endif /* CONFIG_X86_IO_APIC */ + +static int __init setup_acpi_sci(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "edge")) + acpi_sci_flags.trigger = 1; + else if (!strcmp(s, "level")) + acpi_sci_flags.trigger = 3; + else if (!strcmp(s, "high")) + acpi_sci_flags.polarity = 1; + else if (!strcmp(s, "low")) + acpi_sci_flags.polarity = 3; + else + return -EINVAL; + return 0; +} +early_param("acpi_sci", setup_acpi_sci); diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 1a34fc57800..ce75e71b694 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -1372,3 +1372,18 @@ int __init APIC_init_uniprocessor (void) return 0; } + +static int __init parse_lapic(char *arg) +{ + lapic_enable(); + return 0; +} +early_param("lapic", parse_lapic); + +static int __init parse_nolapic(char *arg) +{ + lapic_disable(); + return 0; +} +early_param("nolapic", parse_nolapic); + diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index b713f27d7e8..fd0df75cfbd 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -66,7 +66,7 @@ int sis_apic_bug = -1; */ int nr_ioapic_registers[MAX_IO_APICS]; -int disable_timer_pin_1 __initdata; +static int disable_timer_pin_1 __initdata; /* * Rough estimation of how many shared IRQs there are, can @@ -2691,3 +2691,25 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a } #endif /* CONFIG_ACPI */ + +static int __init parse_disable_timer_pin_1(char *arg) +{ + disable_timer_pin_1 = 1; + return 0; +} +early_param("disable_timer_pin_1", parse_disable_timer_pin_1); + +static int __init parse_enable_timer_pin_1(char *arg) +{ + disable_timer_pin_1 = -1; + return 0; +} +early_param("enable_timer_pin_1", parse_enable_timer_pin_1); + +static int __init parse_noapic(char *arg) +{ + /* disable IO-APIC */ + disable_ioapic_setup(); + return 0; +} +early_param("noapic", parse_noapic); diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c index 6b1ae6ba76f..66c3dc99a65 100644 --- a/arch/i386/kernel/machine_kexec.c +++ b/arch/i386/kernel/machine_kexec.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -209,3 +210,25 @@ NORET_TYPE void machine_kexec(struct kimage *image) rnk = (relocate_new_kernel_t) reboot_code_buffer; (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae); } + +/* crashkernel=size@addr specifies the location to reserve for + * a crash kernel. By reserving this memory we guarantee + * that linux never sets it up as a DMA target. + * Useful for holding code to do something appropriate + * after a kernel panic. + */ +static int __init parse_crashkernel(char *arg) +{ + unsigned long size, base; + size = memparse(arg, &arg); + if (*arg == '@') { + base = memparse(arg+1, &arg); + /* FIXME: Do I want a sanity check + * to validate the memory range? 
+ */ + crashk_res.start = base; + crashk_res.end = base + size - 1; + } + return 0; +} +early_param("crashkernel", parse_crashkernel); diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 71a540362b7..c6e31ed386f 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -96,11 +96,6 @@ unsigned long mmu_cr4_features; #endif EXPORT_SYMBOL(acpi_disabled); -#ifdef CONFIG_ACPI -int __initdata acpi_force = 0; -extern acpi_interrupt_flags acpi_sci_flags; -#endif - /* for MCA, but anyone else can use it if they want */ unsigned int machine_id; #ifdef CONFIG_MCA @@ -148,7 +143,6 @@ EXPORT_SYMBOL(ist_info); struct e820map e820; extern void early_cpu_init(void); -extern void generic_apic_probe(char *); extern int root_mountflags; unsigned long saved_videomode; @@ -700,238 +694,132 @@ static inline void copy_edd(void) } #endif -static void __init parse_cmdline_early (char ** cmdline_p) -{ - char c = ' ', *to = command_line, *from = saved_command_line; - int len = 0; - int userdef = 0; +static int __initdata user_defined_memmap = 0; - /* Save unparsed command line copy for /proc/cmdline */ - saved_command_line[COMMAND_LINE_SIZE-1] = '\0'; +/* + * "mem=nopentium" disables the 4MB page tables. + * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM + * to , overriding the bios size. + * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from + * to +, overriding the bios size. + * + * HPA tells me bootloaders need to parse mem=, so no new + * option should be mem= [also see Documentation/i386/boot.txt] + */ +static int __init parse_mem(char *arg) +{ + if (!arg) + return -EINVAL; - for (;;) { - if (c != ' ') - goto next_char; - /* - * "mem=nopentium" disables the 4MB page tables. - * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM - * to , overriding the bios size. - * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from - * to +, overriding the bios size. - * - * HPA tells me bootloaders need to parse mem=, so no new - * option should be mem= [also see Documentation/i386/boot.txt] + if (strcmp(arg, "nopentium") == 0) { + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); + disable_pse = 1; + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. */ - if (!memcmp(from, "mem=", 4)) { - if (to != command_line) - to--; - if (!memcmp(from+4, "nopentium", 9)) { - from += 9+4; - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); - disable_pse = 1; - } else { - /* If the user specifies memory size, we - * limit the BIOS-provided memory map to - * that size. exactmap can be used to specify - * the exact map. mem=number can be used to - * trim the existing memory map. - */ - unsigned long long mem_size; + unsigned long long mem_size; - mem_size = memparse(from+4, &from); - limit_regions(mem_size); - userdef=1; - } - } - - else if (!memcmp(from, "memmap=", 7)) { - if (to != command_line) - to--; - if (!memcmp(from+7, "exactmap", 8)) { -#ifdef CONFIG_CRASH_DUMP - /* If we are doing a crash dump, we - * still need to know the real mem - * size before original memory map is - * reset. - */ - find_max_pfn(); - saved_max_pfn = max_pfn; -#endif - from += 8+7; - e820.nr_map = 0; - userdef = 1; - } else { - /* If the user specifies memory size, we - * limit the BIOS-provided memory map to - * that size. exactmap can be used to specify - * the exact map. 
mem=number can be used to - * trim the existing memory map. - */ - unsigned long long start_at, mem_size; - - mem_size = memparse(from+7, &from); - if (*from == '@') { - start_at = memparse(from+1, &from); - add_memory_region(start_at, mem_size, E820_RAM); - } else if (*from == '#') { - start_at = memparse(from+1, &from); - add_memory_region(start_at, mem_size, E820_ACPI); - } else if (*from == '$') { - start_at = memparse(from+1, &from); - add_memory_region(start_at, mem_size, E820_RESERVED); - } else { - limit_regions(mem_size); - userdef=1; - } - } - } - - else if (!memcmp(from, "noexec=", 7)) - noexec_setup(from + 7); + mem_size = memparse(arg, &arg); + limit_regions(mem_size); + user_defined_memmap = 1; + } + return 0; +} +early_param("mem", parse_mem); +static int __init parse_memmap(char *arg) +{ + if (!arg) + return -EINVAL; -#ifdef CONFIG_X86_SMP - /* - * If the BIOS enumerates physical processors before logical, - * maxcpus=N at enumeration-time can be used to disable HT. + if (strcmp(arg, "exactmap") == 0) { +#ifdef CONFIG_CRASH_DUMP + /* If we are doing a crash dump, we + * still need to know the real mem + * size before original memory map is + * reset. */ - else if (!memcmp(from, "maxcpus=", 8)) { - extern unsigned int maxcpus; - - maxcpus = simple_strtoul(from + 8, NULL, 0); - } + find_max_pfn(); + saved_max_pfn = max_pfn; #endif - -#ifdef CONFIG_ACPI - /* "acpi=off" disables both ACPI table parsing and interpreter */ - else if (!memcmp(from, "acpi=off", 8)) { - disable_acpi(); - } - - /* acpi=force to over-ride black-list */ - else if (!memcmp(from, "acpi=force", 10)) { - acpi_force = 1; - acpi_ht = 1; - acpi_disabled = 0; - } - - /* acpi=strict disables out-of-spec workarounds */ - else if (!memcmp(from, "acpi=strict", 11)) { - acpi_strict = 1; - } - - /* Limit ACPI just to boot-time to enable HT */ - else if (!memcmp(from, "acpi=ht", 7)) { - if (!acpi_force) - disable_acpi(); - acpi_ht = 1; - } - - /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */ - else if (!memcmp(from, "pci=noacpi", 10)) { - acpi_disable_pci(); - } - /* "acpi=noirq" disables ACPI interrupt routing */ - else if (!memcmp(from, "acpi=noirq", 10)) { - acpi_noirq_set(); + e820.nr_map = 0; + user_defined_memmap = 1; + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. 
+ */ + unsigned long long start_at, mem_size; + + mem_size = memparse(arg, &arg); + if (*arg == '@') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_RAM); + } else if (*arg == '#') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_ACPI); + } else if (*arg == '$') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_RESERVED); + } else { + limit_regions(mem_size); + user_defined_memmap = 1; } + } + return 0; +} +early_param("memmap", parse_memmap); - else if (!memcmp(from, "acpi_sci=edge", 13)) - acpi_sci_flags.trigger = 1; - - else if (!memcmp(from, "acpi_sci=level", 14)) - acpi_sci_flags.trigger = 3; - - else if (!memcmp(from, "acpi_sci=high", 13)) - acpi_sci_flags.polarity = 1; - - else if (!memcmp(from, "acpi_sci=low", 12)) - acpi_sci_flags.polarity = 3; - -#ifdef CONFIG_X86_IO_APIC - else if (!memcmp(from, "acpi_skip_timer_override", 24)) - acpi_skip_timer_override = 1; - - if (!memcmp(from, "disable_timer_pin_1", 19)) - disable_timer_pin_1 = 1; - if (!memcmp(from, "enable_timer_pin_1", 18)) - disable_timer_pin_1 = -1; +#ifdef CONFIG_PROC_VMCORE +/* elfcorehdr= specifies the location of elf core header + * stored by the crashed kernel. + */ +static int __init parse_elfcorehdr(char *arg) +{ + if (!arg) + return -EINVAL; - /* disable IO-APIC */ - else if (!memcmp(from, "noapic", 6)) - disable_ioapic_setup(); -#endif /* CONFIG_X86_IO_APIC */ -#endif /* CONFIG_ACPI */ + elfcorehdr_addr = memparse(arg, &arg); + return 0; +} +early_param("elfcorehdr", parse_elfcorehdr); +#endif /* CONFIG_PROC_VMCORE */ -#ifdef CONFIG_X86_LOCAL_APIC - /* enable local APIC */ - else if (!memcmp(from, "lapic", 5)) - lapic_enable(); +/* + * highmem=size forces highmem to be exactly 'size' bytes. + * This works even on boxes that have no highmem otherwise. + * This also works to reduce highmem size on bigger boxes. + */ +static int __init parse_highmem(char *arg) +{ + if (!arg) + return -EINVAL; - /* disable local APIC */ - else if (!memcmp(from, "nolapic", 6)) - lapic_disable(); -#endif /* CONFIG_X86_LOCAL_APIC */ + highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT; + return 0; +} +early_param("highmem", parse_highmem); -#ifdef CONFIG_KEXEC - /* crashkernel=size@addr specifies the location to reserve for - * a crash kernel. By reserving this memory we guarantee - * that linux never set's it up as a DMA target. - * Useful for holding code to do something appropriate - * after a kernel panic. - */ - else if (!memcmp(from, "crashkernel=", 12)) { - unsigned long size, base; - size = memparse(from+12, &from); - if (*from == '@') { - base = memparse(from+1, &from); - /* FIXME: Do I want a sanity check - * to validate the memory range? - */ - crashk_res.start = base; - crashk_res.end = base + size - 1; - } - } -#endif -#ifdef CONFIG_PROC_VMCORE - /* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. - */ - else if (!memcmp(from, "elfcorehdr=", 11)) - elfcorehdr_addr = memparse(from+11, &from); -#endif +/* + * vmalloc=size forces the vmalloc area to be exactly 'size' + * bytes. This can be used to increase (or decrease) the + * vmalloc area - the default is 128m. + */ +static int __init parse_vmalloc(char *arg) +{ + if (!arg) + return -EINVAL; - /* - * highmem=size forces highmem to be exactly 'size' bytes. - * This works even on boxes that have no highmem otherwise. - * This also works to reduce highmem size on bigger boxes. 
- */ - else if (!memcmp(from, "highmem=", 8)) - highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT; - - /* - * vmalloc=size forces the vmalloc area to be exactly 'size' - * bytes. This can be used to increase (or decrease) the - * vmalloc area - the default is 128m. - */ - else if (!memcmp(from, "vmalloc=", 8)) - __VMALLOC_RESERVE = memparse(from+8, &from); - - next_char: - c = *(from++); - if (!c) - break; - if (COMMAND_LINE_SIZE <= ++len) - break; - *(to++) = c; - } - *to = '\0'; - *cmdline_p = command_line; - if (userdef) { - printk(KERN_INFO "user-defined physical RAM map:\n"); - print_memory_map("user"); - } + __VMALLOC_RESERVE = memparse(arg, &arg); + return 0; } +early_param("vmalloc", parse_vmalloc); /* * Callback for efi_memory_walk. @@ -1507,17 +1395,15 @@ void __init setup_arch(char **cmdline_p) data_resource.start = virt_to_phys(_etext); data_resource.end = virt_to_phys(_edata)-1; - parse_cmdline_early(cmdline_p); + parse_early_param(); -#ifdef CONFIG_EARLY_PRINTK - { - char *s = strstr(*cmdline_p, "earlyprintk="); - if (s) { - setup_early_printk(strchr(s, '=') + 1); - printk("early console enabled\n"); - } + if (user_defined_memmap) { + printk(KERN_INFO "user-defined physical RAM map:\n"); + print_memory_map("user"); } -#endif + + strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; max_low_pfn = setup_memory(); @@ -1546,7 +1432,7 @@ void __init setup_arch(char **cmdline_p) dmi_scan_machine(); #ifdef CONFIG_X86_GENERICARCH - generic_apic_probe(*cmdline_p); + generic_apic_probe(); #endif if (efi_enabled) efi_map_memmap(); diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 9367af76ce3..517eb387455 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -1491,3 +1491,16 @@ void __init smp_intr_init(void) /* IPI for generic function call */ set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); } + +/* + * If the BIOS enumerates physical processors before logical, + * maxcpus=N at enumeration-time can be used to disable HT. + */ +static int __init parse_maxcpus(char *arg) +{ + extern unsigned int maxcpus; + + maxcpus = simple_strtoul(arg, NULL, 0); + return 0; +} +early_param("maxcpus", parse_maxcpus); -- cgit v1.2.3 From df3bb57d2c0160ccd1ee51322f50aa295c3b0858 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] i386/x86-64: Move acpi_disabled variables into acpi/boot.c Removes code duplication between i386/x86-64. 
Not needed anymore in setup.c since early_param cleanup Cc: len.brown@intel.com Signed-off-by: Andi Kleen --- arch/i386/kernel/acpi/boot.c | 7 +++++++ arch/i386/kernel/setup.c | 7 ------- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index 87e2ab50b69..f3761b98c8d 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -38,6 +38,13 @@ int __initdata acpi_force = 0; +#ifdef CONFIG_ACPI +int acpi_disabled = 0; +#else +int acpi_disabled = 1; +#endif +EXPORT_SYMBOL(acpi_disabled); + #ifdef CONFIG_X86_64 extern void __init clustered_apic_check(void); diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index c6e31ed386f..ea17567dbe7 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -89,13 +89,6 @@ EXPORT_SYMBOL(boot_cpu_data); unsigned long mmu_cr4_features; -#ifdef CONFIG_ACPI - int acpi_disabled = 0; -#else - int acpi_disabled = 1; -#endif -EXPORT_SYMBOL(acpi_disabled); - /* for MCA, but anyone else can use it if they want */ unsigned int machine_id; #ifdef CONFIG_MCA -- cgit v1.2.3 From 91cd444e56ebe0c2acd9576a045d77490b26f607 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] x86: Remove unneeded externs in acpi/boot.c And move one into proto.h Cc: len.brown@intel.com Signed-off-by: Andi Kleen --- arch/i386/kernel/acpi/boot.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index f3761b98c8d..c809a3f4c0b 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -47,9 +47,6 @@ EXPORT_SYMBOL(acpi_disabled); #ifdef CONFIG_X86_64 -extern void __init clustered_apic_check(void); - -extern int gsi_irq_sharing(int gsi); #include static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; } -- cgit v1.2.3 From 2ade2920dcefdf5595c6380ebed131c964190855 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] i386/x86-64: rename is_at_popf(), add iret to tests and fix is_at_popf() needs to test for the iret instruction as well as popf. So add that test and rename it to is_setting_trap_flag(). Also change max insn length from 16 to 15 to match reality. LAHF / SAHF can't affect TF, so the comment in x86_64 is removed. 
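Restated as a sketch (hypothetical helper; the actual change to is_setting_trap_flag() is in the diff below), the point is that both instructions reload EFLAGS from the stack:

/* popf (0x9d) and iret (0xcf) both pop EFLAGS, so single-stepping
 * over either one can set TF without the tracer's involvement; the
 * tracer must not take credit for a TF bit it did not set itself */
static int insn_may_set_tf(const unsigned char *opcode, int len)
{
	int i;

	for (i = 0; i < len; i++) {
		switch (opcode[i]) {
		case 0x9d:	/* popf */
		case 0xcf:	/* iret */
			return 1;
		case 0x66:	/* operand size prefix */
		case 0x67:	/* address size prefix */
			continue;	/* skip prefixes, keep scanning */
		default:
			return 0;	/* anything else can't pop EFLAGS */
		}
	}
	return 0;
}

15 bytes is the architectural maximum length of one x86 instruction, hence the opcode[] size change.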
Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen --- arch/i386/kernel/ptrace.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index d3db03f4085..775f50e9395 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -185,17 +185,17 @@ static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_ return addr; } -static inline int is_at_popf(struct task_struct *child, struct pt_regs *regs) +static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) { int i, copied; - unsigned char opcode[16]; + unsigned char opcode[15]; unsigned long addr = convert_eip_to_linear(child, regs); copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); for (i = 0; i < copied; i++) { switch (opcode[i]) { - /* popf */ - case 0x9d: + /* popf and iret */ + case 0x9d: case 0xcf: return 1; /* opcode and address size prefixes */ case 0x66: case 0x67: @@ -247,7 +247,7 @@ static void set_singlestep(struct task_struct *child) * don't mark it as being "us" that set it, so that we * won't clear it by hand later. */ - if (is_at_popf(child, regs)) + if (is_setting_trap_flag(child, regs)) return; child->ptrace |= PT_DTRACE; -- cgit v1.2.3 From 5a1b3999d6cb7ab87f1f3b1700bc91839fd6fa29 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] x86: Some preparatory cleanup for stack trace - Remove unused all_contexts parameter. No caller used it. - Move skip argument into the structure (needed for follow-on patches) Cc: mingo@elte.hu Signed-off-by: Andi Kleen --- arch/i386/kernel/stacktrace.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/stacktrace.c b/arch/i386/kernel/stacktrace.c index e62a037ab39..ae3c32a87ad 100644 --- a/arch/i386/kernel/stacktrace.c +++ b/arch/i386/kernel/stacktrace.c @@ -61,12 +61,8 @@ save_context_stack(struct stack_trace *trace, unsigned int skip, /* * Save stack-backtrace addresses into a stack_trace buffer. - * If all_contexts is set, all contexts (hardirq, softirq and process) - * are saved. If not set then only the current context is saved. */ -void save_stack_trace(struct stack_trace *trace, - struct task_struct *task, int all_contexts, - unsigned int skip) +void save_stack_trace(struct stack_trace *trace, struct task_struct *task) { unsigned long ebp; unsigned long *stack = &ebp; @@ -85,10 +81,9 @@ void save_stack_trace(struct stack_trace *trace, struct thread_info *context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - ebp = save_context_stack(trace, skip, context, stack, ebp); + ebp = save_context_stack(trace, trace->skip, context, stack, ebp); stack = (unsigned long *)context->previous_esp; - if (!all_contexts || !stack || - trace->nr_entries >= trace->max_entries) + if (!stack || trace->nr_entries >= trace->max_entries) break; trace->entries[trace->nr_entries++] = ULONG_MAX; if (trace->nr_entries >= trace->max_entries) break; -- cgit v1.2.3 From 2b14a78cd07a52001b8c3865ed615d8b9b905b78 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] i386: Do stacktracer conversion too Follows the x86-64 patches, and in fact reuses code from them. Convert the standard backtracer to do all output using callbacks. 
Use the x86-64 stack tracer implementation that uses these callbacks to implement the stacktrace interface. This allows to use the new dwarf2 unwinder for stacktrace and get better backtraces. Cc: mingo@elte.hu Signed-off-by: Andi Kleen --- arch/i386/kernel/Makefile | 1 + arch/i386/kernel/stacktrace.c | 93 ------------------------------------ arch/i386/kernel/traps.c | 108 +++++++++++++++++++++++++++++++----------- 3 files changed, 81 insertions(+), 121 deletions(-) delete mode 100644 arch/i386/kernel/stacktrace.c (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index dab497472de..1a884b6e6e5 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -81,4 +81,5 @@ $(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \ $(call if_changed,syscall) k8-y += ../../x86_64/kernel/k8.o +stacktrace-y += ../../x86_64/kernel/stacktrace.o diff --git a/arch/i386/kernel/stacktrace.c b/arch/i386/kernel/stacktrace.c deleted file mode 100644 index ae3c32a87ad..00000000000 --- a/arch/i386/kernel/stacktrace.c +++ /dev/null @@ -1,93 +0,0 @@ -/* - * arch/i386/kernel/stacktrace.c - * - * Stack trace management functions - * - * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar - */ -#include -#include - -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) -{ - return p > (void *)tinfo && - p < (void *)tinfo + THREAD_SIZE - 3; -} - -/* - * Save stack-backtrace addresses into a stack_trace buffer: - */ -static inline unsigned long -save_context_stack(struct stack_trace *trace, unsigned int skip, - struct thread_info *tinfo, unsigned long *stack, - unsigned long ebp) -{ - unsigned long addr; - -#ifdef CONFIG_FRAME_POINTER - while (valid_stack_ptr(tinfo, (void *)ebp)) { - addr = *(unsigned long *)(ebp + 4); - if (!skip) - trace->entries[trace->nr_entries++] = addr; - else - skip--; - if (trace->nr_entries >= trace->max_entries) - break; - /* - * break out of recursive entries (such as - * end_of_stack_stop_unwind_function): - */ - if (ebp == *(unsigned long *)ebp) - break; - - ebp = *(unsigned long *)ebp; - } -#else - while (valid_stack_ptr(tinfo, stack)) { - addr = *stack++; - if (__kernel_text_address(addr)) { - if (!skip) - trace->entries[trace->nr_entries++] = addr; - else - skip--; - if (trace->nr_entries >= trace->max_entries) - break; - } - } -#endif - - return ebp; -} - -/* - * Save stack-backtrace addresses into a stack_trace buffer. 
- */ -void save_stack_trace(struct stack_trace *trace, struct task_struct *task) -{ - unsigned long ebp; - unsigned long *stack = &ebp; - - WARN_ON(trace->nr_entries || !trace->max_entries); - - if (!task || task == current) { - /* Grab ebp right from our regs: */ - asm ("movl %%ebp, %0" : "=r" (ebp)); - } else { - /* ebp is the last reg pushed by switch_to(): */ - ebp = *(unsigned long *) task->thread.esp; - } - - while (1) { - struct thread_info *context = (struct thread_info *) - ((unsigned long)stack & (~(THREAD_SIZE - 1))); - - ebp = save_context_stack(trace, trace->skip, context, stack, ebp); - stack = (unsigned long *)context->previous_esp; - if (!stack || trace->nr_entries >= trace->max_entries) - break; - trace->entries[trace->nr_entries++] = ULONG_MAX; - if (trace->nr_entries >= trace->max_entries) - break; - } -} - diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 3c85c89f68d..4ced4285163 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -51,6 +51,7 @@ #include #include #include +#include #include @@ -118,26 +119,16 @@ static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) p < (void *)tinfo + THREAD_SIZE - 3; } -/* - * Print one address/symbol entries per line. - */ -static inline void print_addr_and_symbol(unsigned long addr, char *log_lvl) -{ - printk(" [<%08lx>] ", addr); - - print_symbol("%s\n", addr); -} - static inline unsigned long print_context_stack(struct thread_info *tinfo, unsigned long *stack, unsigned long ebp, - char *log_lvl) + struct stacktrace_ops *ops, void *data) { unsigned long addr; #ifdef CONFIG_FRAME_POINTER while (valid_stack_ptr(tinfo, (void *)ebp)) { addr = *(unsigned long *)(ebp + 4); - print_addr_and_symbol(addr, log_lvl); + ops->address(data, addr); /* * break out of recursive entries (such as * end_of_stack_stop_unwind_function): @@ -150,28 +141,35 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo, while (valid_stack_ptr(tinfo, stack)) { addr = *stack++; if (__kernel_text_address(addr)) - print_addr_and_symbol(addr, log_lvl); + ops->address(data, addr); } #endif return ebp; } +struct ops_and_data { + struct stacktrace_ops *ops; + void *data; +}; + static asmlinkage int -show_trace_unwind(struct unwind_frame_info *info, void *log_lvl) +dump_trace_unwind(struct unwind_frame_info *info, void *data) { + struct ops_and_data *oad = (struct ops_and_data *)data; int n = 0; while (unwind(info) == 0 && UNW_PC(info)) { n++; - print_addr_and_symbol(UNW_PC(info), log_lvl); + oad->ops->address(oad->data, UNW_PC(info)); if (arch_unw_user_mode(info)) break; } return n; } -static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, char *log_lvl) +void dump_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, + struct stacktrace_ops *ops, void *data) { unsigned long ebp; @@ -181,31 +179,37 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (call_trace >= 0) { int unw_ret = 0; struct unwind_frame_info info; + struct ops_and_data oad = { .ops = ops, .data = data }; if (regs) { if (unwind_init_frame_info(&info, task, regs) == 0) - unw_ret = show_trace_unwind(&info, log_lvl); + unw_ret = dump_trace_unwind(&info, &oad); } else if (task == current) - unw_ret = unwind_init_running(&info, show_trace_unwind, log_lvl); + unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad); else { if (unwind_init_blocked(&info, task) == 0) - unw_ret = show_trace_unwind(&info, log_lvl); + unw_ret = 
dump_trace_unwind(&info, &oad); } if (unw_ret > 0) { if (call_trace == 1 && !arch_unw_user_mode(&info)) { - print_symbol("DWARF2 unwinder stuck at %s\n", + ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n", UNW_PC(&info)); if (UNW_SP(&info) >= PAGE_OFFSET) { - printk("Leftover inexact backtrace:\n"); + ops->warning(data, "Leftover inexact backtrace:\n"); stack = (void *)UNW_SP(&info); } else - printk("Full inexact backtrace again:\n"); + ops->warning(data, "Full inexact backtrace again:\n"); } else if (call_trace >= 1) return; else - printk("Full inexact backtrace again:\n"); + ops->warning(data, "Full inexact backtrace again:\n"); } else - printk("Inexact backtrace:\n"); + ops->warning(data, "Inexact backtrace:\n"); + } else if (!stack) { + unsigned long dummy; + stack = &dummy; + if (task && task != current) + stack = (unsigned long *)task->thread.esp; } if (task == current) { @@ -220,15 +224,63 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, struct thread_info *context; context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - ebp = print_context_stack(context, stack, ebp, log_lvl); + ebp = print_context_stack(context, stack, ebp, ops, data); + /* Should be after the line below, but somewhere + in early boot context comes out corrupted and we + can't reference it -AK */ + if (ops->stack(data, "IRQ") < 0) + break; stack = (unsigned long*)context->previous_esp; if (!stack) break; - printk("%s =======================\n", log_lvl); } } +EXPORT_SYMBOL(dump_trace); + +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + printk(data); + print_symbol(msg, symbol); + printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ + printk("%s%s\n", (char *)data, msg); +} + +static int print_trace_stack(void *data, char *name) +{ + return 0; +} + +/* + * Print one address/symbol entries per line. 
+ */ +static void print_trace_address(void *data, unsigned long addr) +{ + printk("%s [<%08lx>] ", (char *)data, addr); + print_symbol("%s\n", addr); +} + +static struct stacktrace_ops print_trace_ops = { + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, +}; + +static void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long * stack, char *log_lvl) +{ + dump_trace(task, regs, stack, &print_trace_ops, log_lvl); + printk("%s =======================\n", log_lvl); +} -void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack) +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long * stack) { show_trace_log_lvl(task, regs, stack, ""); } -- cgit v1.2.3 From 950fee84557416a3427dd404a13addc4be7b3e6c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] i386: Terminate backtrace fallback early if unwinder stack pointer is zero Cc: jbeulich@novell.com Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 4ced4285163..86fa7e47f30 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -197,6 +197,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (UNW_SP(&info) >= PAGE_OFFSET) { ops->warning(data, "Leftover inexact backtrace:\n"); stack = (void *)UNW_SP(&info); + if (!stack) + return; } else ops->warning(data, "Full inexact backtrace again:\n"); } else if (call_trace >= 1) -- cgit v1.2.3 From a32cf3975bed3b84491f8ffeb24abe8c45d86ab0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] i386: Get ebp from unwinder state when continuing fallback backtrace Cc: jbeulich@novell.com Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 86fa7e47f30..bdf949c30c7 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -171,7 +171,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, struct stacktrace_ops *ops, void *data) { - unsigned long ebp; + unsigned long ebp = 0; if (!task) task = current; @@ -199,6 +199,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, stack = (void *)UNW_SP(&info); if (!stack) return; + ebp = UNW_FP(&info); } else ops->warning(data, "Full inexact backtrace again:\n"); } else if (call_trace >= 1) @@ -207,20 +208,25 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, ops->warning(data, "Full inexact backtrace again:\n"); } else ops->warning(data, "Inexact backtrace:\n"); - } else if (!stack) { + } + if (!stack) { unsigned long dummy; stack = &dummy; if (task && task != current) stack = (unsigned long *)task->thread.esp; } - if (task == current) { - /* Grab ebp right from our regs */ - asm ("movl %%ebp, %0" : "=r" (ebp) : ); - } else { - /* ebp is the last reg pushed by switch_to */ - ebp = *(unsigned long *) task->thread.esp; +#ifdef CONFIG_FRAME_POINTER + if (!ebp) { + if (task == current) { + /* Grab ebp right from our regs */ + asm ("movl %%ebp, %0" : "=r" (ebp) : ); + } else { + /* ebp is the last reg pushed by switch_to */ + ebp = *(unsigned long *) task->thread.esp; + } } +#endif 
while (1) { struct thread_info *context; -- cgit v1.2.3 From 5758d5dfef1c514200bda3f29ba700f1c3e3bc99 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] i386: fix dubious segment register clear in cpu_init() Fix a very dubious piece of code in arch/i386/kernel/cpu/common.c:cpu_init(). This clears out %fs and %gs, but clobbers %eax in the process without telling gcc. It turns out that gcc happens not to be using %eax at that point anyway, so it doesn't matter much, but it looks like a bomb waiting to go off. This does end up saving an instruction, because gcc wants %eax==0 for the set_debugreg()s below. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 70c87de582c..730a7ab7500 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -675,7 +675,7 @@ old_gdt: #endif /* Clear %fs and %gs. */ - asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); + asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0)); /* Clear all 6 debug registers: */ set_debugreg(0, 0); -- cgit v1.2.3 From 3ca113ea74836a80645c79adba24caaa7a74120c Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] i386: don't taint UP K7's running SMP kernels. We have a test that looks for invalid pairings of certain Athlons/Durons that weren't designed for SMP, and taints accordingly (with 'S') if we find such a configuration. However, this test shouldn't fire if there's only a single CPU present. It's perfectly valid for an SMP kernel to boot on UP hardware, for example. AK: changed to num_possible_cpus() Signed-off-by: Dave Jones Signed-off-by: Andi Kleen --- arch/i386/kernel/smpboot.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 517eb387455..020d873b7d2 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -177,6 +177,9 @@ static void __devinit smp_store_cpu_info(int id) */ if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) { + if (num_possible_cpus() == 1) + goto valid_k7; + /* Athlon 660/661 is valid. */ if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1))) goto valid_k7; -- cgit v1.2.3 From d28c4393a7bf558538e9def269c1caeab6ec056f Mon Sep 17 00:00:00 2001 From: "Prasanna S.P" Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] x86: error_code is not safe for kprobes This patch moves entry.S::error_entry into the .kprobes.text section, since code marked unsafe for kprobes jumps directly to entry.S::error_entry, which must therefore be marked unsafe as well. This patch also moves all the ".previous.text" asm directives to ".previous" for the kprobes sections. AK: Following a similar i386 patch from Chuck Ebbert AK: Also merged Jeremy's fix in. +From: Jeremy Fitzhardinge KPROBE_ENTRY does a .section .kprobes.text, and expects its users to do a .previous at the end of the function. Unfortunately, if any code within the function switches sections, for example .fixup, then the .previous ends up putting all subsequent code into .fixup. Worse, any subsequent .fixup code gets intermingled with the code it's supposed to be fixing (which is also in .fixup). It's surprising this didn't cause more havoc. The fix is to use .pushsection/.popsection, so this stuff nests properly.
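A reduced sketch of the difference, assuming GNU assembler semantics (illustrative only, not the actual entry.S code): .previous swaps the current section with the previously used one; it does not pop a stack.

/* broken: the second .previous lands in .fixup, not where we started */
__asm__(
"	.section .kprobes.text, \"ax\"\n"	/* what KPROBE_ENTRY does */
"	.section .fixup, \"ax\"\n"		/* nested switch inside the function */
"	.previous\n"				/* swap: back in .kprobes.text, fine */
"	.previous\n"				/* swap again: now in .fixup, wrong */
"	.text\n"
);

/* nests correctly: a real section stack */
__asm__(
"	.pushsection .kprobes.text, \"ax\"\n"
"	.pushsection .fixup, \"ax\"\n"
"	.popsection\n"				/* back to .kprobes.text */
"	.popsection\n"				/* back to the original section */
);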
A further cleanup would be to get rid of all .section/.previous pairs, since they're inherently fragile. +From: Chuck Ebbert <76306.1226@compuserve.com> Because code marked unsafe for kprobes jumps directly to entry.S::error_code, that must be marked unsafe as well. The easiest way to do that is to move the page fault entry point to just before error_code and let it inherit the same section. Also moved all the ".previous" asm directives for kprobes sections to column 1 and removed ".text" from them. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen --- arch/i386/kernel/entry.S | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 87f9f60b803..ba22ec8fab5 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -591,11 +591,9 @@ ENTRY(name) \ /* The include is where all of the SMP etc. interrupts come from */ #include "entry_arch.h" -ENTRY(divide_error) - RING0_INT_FRAME - pushl $0 # no error code - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_divide_error +KPROBE_ENTRY(page_fault) + RING0_EC_FRAME + pushl $do_page_fault CFI_ADJUST_CFA_OFFSET 4 ALIGN error_code: @@ -645,6 +643,7 @@ error_code: call *%edi jmp ret_from_exception CFI_ENDPROC +KPROBE_END(page_fault) ENTRY(coprocessor_error) RING0_INT_FRAME @@ -720,7 +719,8 @@ debug_stack_correct: call do_debug jmp ret_from_exception CFI_ENDPROC - .previous .text +KPROBE_END(debug) + /* * NMI is doubly nasty. It can happen _while_ we're handling * a debug fault, and the debug fault hasn't yet been able to @@ -816,7 +816,7 @@ KPROBE_ENTRY(int3) call do_int3 jmp ret_from_exception CFI_ENDPROC - .previous .text +KPROBE_END(int3) ENTRY(overflow) RING0_INT_FRAME @@ -881,7 +881,7 @@ KPROBE_ENTRY(general_protection) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC - .previous .text +KPROBE_END(general_protection) ENTRY(alignment_check) RING0_EC_FRAME @@ -890,13 +890,14 @@ ENTRY(alignment_check) jmp error_code CFI_ENDPROC -KPROBE_ENTRY(page_fault) - RING0_EC_FRAME - pushl $do_page_fault +ENTRY(divide_error) + RING0_INT_FRAME + pushl $0 # no error code + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_divide_error CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC - .previous .text #ifdef CONFIG_X86_MCE ENTRY(machine_check) -- cgit v1.2.3 From 5e4edbb711417be40f350a319db39888a4edd450 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] i386: Fix warning in mpparse.c Fix linux/arch/i386/kernel/mpparse.c: In function 'MP_bus_info': linux/arch/i386/kernel/mpparse.c:232: warning: comparison is always false due to limited range of data type Signed-off-by: Andi Kleen --- arch/i386/kernel/mpparse.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index bdaa75a5bc1..772a4233bfe 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -229,12 +229,14 @@ static void __init MP_bus_info (struct mpc_config_bus *m) mpc_oem_bus_info(m, str, translation_table[mpc_record]); +#if MAX_MP_BUSSES < 256 if (m->mpc_busid >= MAX_MP_BUSSES) { printk(KERN_WARNING "MP table busid value (%d) for bustype %s " " is too large, max.
supported is %d\n", m->mpc_busid, str, MAX_MP_BUSSES - 1); return; } +#endif if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; -- cgit v1.2.3 From e8924acb2ef46b96c93f97025815ef3843cb67a2 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] i386: Make acpi_force static acpi_force can become static. Cc: len.brown@intel.com Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen --- arch/i386/kernel/acpi/boot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index c809a3f4c0b..0fba420d668 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -36,7 +36,7 @@ #include #include -int __initdata acpi_force = 0; +static int __initdata acpi_force = 0; #ifdef CONFIG_ACPI int acpi_disabled = 0; -- cgit v1.2.3 From 3d08a256da8aed5300bd0752200ece426f49b050 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] i386: Make enable_local_apic static enable_local_apic can now become static. Cc: len.brown@intel.com Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen --- arch/i386/kernel/apic.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index ce75e71b694..90faae5c5d3 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -52,7 +52,18 @@ static cpumask_t timer_bcast_ipi; /* * Knob to control our willingness to enable the local APIC. */ -int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ +static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ + +static inline void lapic_disable(void) +{ + enable_local_apic = -1; + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); +} + +static inline void lapic_enable(void) +{ + enable_local_apic = 1; +} /* * Debug level -- cgit v1.2.3 From 02ba1a32dbd3d406530a17a2643a8f0f8cbf3acc Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] i386: move kernel_thread_helper into entry.S And add proper CFI annotation to it which was previously impossible. This prevents "stuck" messages by the dwarf2 unwinder when reaching the top of a kernel stack. Includes feedback from Jan Beulich Cc: jbeulich@novell.com Signed-off-by: Andi Kleen --- arch/i386/kernel/entry.S | 13 +++++++++++++ arch/i386/kernel/process.c | 9 --------- 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index ba22ec8fab5..dede506e5bd 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -950,6 +950,19 @@ ENTRY(arch_unwind_init_running) ENDPROC(arch_unwind_init_running) #endif +ENTRY(kernel_thread_helper) + pushl $0 # fake return address for unwinder + CFI_STARTPROC + movl %edx,%eax + push %edx + CFI_ADJUST_CFA_OFFSET 4 + call *%ebx + push %eax + CFI_ADJUST_CFA_OFFSET 4 + call do_exit + CFI_ENDPROC +ENDPROC(kernel_thread_helper) + .section .rodata,"a" #include "syscall_table.S" diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index b741c3e1a5e..220aeca59c3 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -321,15 +321,6 @@ void show_regs(struct pt_regs * regs) * the "args". 
*/ extern void kernel_thread_helper(void); -__asm__(".section .text\n" - ".align 4\n" - "kernel_thread_helper:\n\t" - "movl %edx,%eax\n\t" - "pushl %edx\n\t" - "call *%ebx\n\t" - "pushl %eax\n\t" - "call do_exit\n" - ".previous"); /* * Create a kernel thread -- cgit v1.2.3 From 522e93e3fcdbf00ba85c72fde6df28cfc0486a65 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] i386: Descriptor and trap table cleanups. The implementation comes from Zach's [RFC, PATCH 10/24] i386 Vmi descriptor changes: Descriptor and trap table cleanups. Add cleanly written accessors for IDT and GDT gates so the subarch may override them. Note that this allows the hypervisor to transparently tweak the DPL of the descriptors as well as the RPL of segments in those descriptors, with no unnecessary kernel code modification. It also allows the hypervisor implementation of the VMI to tweak the gates, allowing for custom exception frames or extra layers of indirection above the guest fault / IRQ handlers. Signed-off-by: Zachary Amsden Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index bdf949c30c7..00d643f3de4 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -1165,20 +1165,6 @@ void __init trap_init_f00f_bug(void) } #endif -#define _set_gate(gate_addr,type,dpl,addr,seg) \ -do { \ - int __d0, __d1; \ - __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \ - "movw %4,%%dx\n\t" \ - "movl %%eax,%0\n\t" \ - "movl %%edx,%1" \ - :"=m" (*((long *) (gate_addr))), \ - "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \ - :"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \ - "3" ((char *) (addr)),"2" ((seg) << 16)); \ -} while (0) - - /* * This needs to use 'idt_table' rather than 'idt', and * thus use the _nonmapped_ version of the IDT, as the @@ -1187,7 +1173,7 @@ do { \ */ void set_intr_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n,14,0,addr,__KERNEL_CS); + _set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS); } /* @@ -1195,22 +1181,22 @@ void set_intr_gate(unsigned int n, void *addr) */ static inline void set_system_intr_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n, 14, 3, addr, __KERNEL_CS); + _set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS); } static void __init set_trap_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n,15,0,addr,__KERNEL_CS); + _set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS); } static void __init set_system_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n,15,3,addr,__KERNEL_CS); + _set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS); } static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) { - _set_gate(idt_table+n,5,0,0,(gdt_entry<<3)); + _set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3)); } -- cgit v1.2.3 From aada06c9b7f4cdedbeb2bc905893cf1923a0abad Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] i386: clean up topology.c There is no need to duplicate the topology_init() function. 
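For reference, subsys_initcall() itself is nothing more than section placement; roughly what 2.6-era include/linux/init.h does (a sketch from memory, the exact attribute spelling may differ):

typedef int (*initcall_t)(void);

/* each level gets its own .initcall<level>.init section; do_initcalls()
 * walks the levels in order at boot, so the single topology_init()
 * runs once, after the core, postcore, and arch levels */
#define __define_initcall(level, fn) \
	static initcall_t __initcall_##fn \
	__attribute__((__used__, __section__(".initcall" level ".init"))) = fn

#define subsys_initcall(fn) __define_initcall("4", fn)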
Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- arch/i386/kernel/topology.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c index e2e281d4bcc..07d6da36a82 100644 --- a/arch/i386/kernel/topology.c +++ b/arch/i386/kernel/topology.c @@ -28,6 +28,7 @@ #include #include #include +#include #include static struct i386_cpu cpu_devices[NR_CPUS]; @@ -55,34 +56,18 @@ EXPORT_SYMBOL(arch_register_cpu); EXPORT_SYMBOL(arch_unregister_cpu); #endif /*CONFIG_HOTPLUG_CPU*/ - - -#ifdef CONFIG_NUMA -#include - static int __init topology_init(void) { int i; +#ifdef CONFIG_NUMA for_each_online_node(i) register_one_node(i); +#endif /* CONFIG_NUMA */ for_each_present_cpu(i) arch_register_cpu(i); return 0; } -#else /* !CONFIG_NUMA */ - -static int __init topology_init(void) -{ - int i; - - for_each_present_cpu(i) - arch_register_cpu(i); - return 0; -} - -#endif /* CONFIG_NUMA */ - subsys_initcall(topology_init); -- cgit v1.2.3 From c0d83745cc67ed71a08c14739a0b286d0239b1e2 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] i386: mark two more functions as __init cyrix_identify() should be __init because transmeta_identify() is. tsc_init() is only called from setup_arch(), which is marked __init. These two section mismatches have been detected by running modpost on a vmlinux image compiled with CONFIG_RELOCATABLE=y. Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- arch/i386/kernel/tsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index 7e0d8dab207..b8fa0a8b2e4 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -192,7 +192,7 @@ int recalibrate_cpu_khz(void) EXPORT_SYMBOL(recalibrate_cpu_khz); -void tsc_init(void) +void __init tsc_init(void) { if (!cpu_has_tsc || tsc_disable) return; -- cgit v1.2.3 From 73fea175303926055440c06bc8894f0c5c58afc8 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] i386: Support physical cpu hotplug for x86_64 This patch enables ACPI based physical CPU hotplug support for x86_64. Implements acpi_map_lsapic() and acpi_unmap_lsapic() to support physical cpu hotplug.
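One detail of the implementation is worth pulling out: the new logical CPU number is recovered by diffing cpu_present_map around the registration call. In isolation, using the era's cpumask API (a sketch mirroring the diff that follows, kernel context assumed):

static int find_new_cpu(u8 physid)
{
	cpumask_t tmp_map, new_map;

	tmp_map = cpu_present_map;	/* snapshot before registering */
	mp_register_lapic(physid, 1);	/* may mark one more CPU present */

	/* whatever is set now but was not before is the new cpu */
	cpus_andnot(new_map, cpu_present_map, tmp_map);
	return cpus_empty(new_map) ? -1 : first_cpu(new_map);
}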
Signed-off-by: Ashok Raj Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: "Brown, Len" Signed-off-by: Andrew Morton --- arch/i386/kernel/acpi/boot.c | 69 +++++++++++++++++++++++++++++++++++++++++--- arch/i386/kernel/mpparse.c | 2 +- 2 files changed, 66 insertions(+), 5 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index 0fba420d668..8a6c5b41234 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -512,16 +513,76 @@ EXPORT_SYMBOL(acpi_register_gsi); #ifdef CONFIG_ACPI_HOTPLUG_CPU int acpi_map_lsapic(acpi_handle handle, int *pcpu) { - /* TBD */ - return -EINVAL; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object *obj; + struct acpi_table_lapic *lapic; + cpumask_t tmp_map, new_map; + u8 physid; + int cpu; + + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) + return -EINVAL; + + if (!buffer.length || !buffer.pointer) + return -EINVAL; + + obj = buffer.pointer; + if (obj->type != ACPI_TYPE_BUFFER || + obj->buffer.length < sizeof(*lapic)) { + kfree(buffer.pointer); + return -EINVAL; + } + + lapic = (struct acpi_table_lapic *)obj->buffer.pointer; + + if ((lapic->header.type != ACPI_MADT_LAPIC) || + (!lapic->flags.enabled)) { + kfree(buffer.pointer); + return -EINVAL; + } + + physid = lapic->id; + + kfree(buffer.pointer); + buffer.length = ACPI_ALLOCATE_BUFFER; + buffer.pointer = NULL; + + tmp_map = cpu_present_map; + mp_register_lapic(physid, lapic->flags.enabled); + + /* + * If mp_register_lapic successfully generates a new logical cpu + * number, then the following will get us exactly what was mapped + */ + cpus_andnot(new_map, cpu_present_map, tmp_map); + if (cpus_empty(new_map)) { + printk ("Unable to map lapic to logical cpu number\n"); + return -EINVAL; + } + + cpu = first_cpu(new_map); + + *pcpu = cpu; + return 0; } EXPORT_SYMBOL(acpi_map_lsapic); int acpi_unmap_lsapic(int cpu) { - /* TBD */ - return -EINVAL; + int i; + + for_each_possible_cpu(i) { + if (x86_acpiid_to_apicid[i] == x86_cpu_to_apicid[cpu]) { + x86_acpiid_to_apicid[i] = -1; + break; + } + } + x86_cpu_to_apicid[cpu] = -1; + cpu_clear(cpu, cpu_present_map); + num_processors--; + + return (0); } EXPORT_SYMBOL(acpi_unmap_lsapic); diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index 772a4233bfe..442aaf8c77e 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -69,7 +69,7 @@ unsigned int def_to_bigsmp = 0; /* Processor that is doing the boot up */ unsigned int boot_cpu_physical_apicid = -1U; /* Internal processor count */ -static unsigned int __devinitdata num_processors; +unsigned int __cpuinitdata num_processors; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map; -- cgit v1.2.3 From 68bbc172cd1b0ee01814304b8a7bef8922d5fdca Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] i386: remove redundant generic_identify() calls when identifying cpus cpu_dev->c_identify is only called from arch/i386/common.c:identify_cpu(), and only after generic_identify() has already been called. There is no need to call this function twice by hooking it into c_identify - but I may be wrong, please double-check before applying. This patch also removes generic_identify() from cpu.h to avoid unnecessary future nesting.
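The call order in identify_cpu() that makes those hooks redundant looks roughly like this (a simplified sketch, not the verbatim function):

static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
{
	generic_identify(c);		/* common CPUID probing, done once here */

	if (this_cpu->c_identify)
		this_cpu->c_identify(c);	/* vendor-specific detection only */

	if (this_cpu->c_init)
		this_cpu->c_init(c);	/* vendor-specific setup */
}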
Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/amd.c | 1 - arch/i386/kernel/cpu/common.c | 2 +- arch/i386/kernel/cpu/cpu.h | 2 -- arch/i386/kernel/cpu/cyrix.c | 2 -- arch/i386/kernel/cpu/intel.c | 1 - arch/i386/kernel/cpu/nexgen.c | 1 - arch/i386/kernel/cpu/transmeta.c | 1 - 7 files changed, 1 insertion(+), 9 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index e6a2d6b80cd..d6e7778bac7 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -275,7 +275,6 @@ static struct cpu_dev amd_cpu_dev __initdata = { }, }, .c_init = init_amd, - .c_identify = generic_identify, .c_size_cache = amd_size_cache, }; diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 730a7ab7500..24ac51a0237 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -265,7 +265,7 @@ static void __init early_cpu_detect(void) } } -void __cpuinit generic_identify(struct cpuinfo_x86 * c) +static void __cpuinit generic_identify(struct cpuinfo_x86 * c) { u32 tfms, xlvl; int ebx; diff --git a/arch/i386/kernel/cpu/cpu.h b/arch/i386/kernel/cpu/cpu.h index 5a1d4f163e8..2f6432cef6f 100644 --- a/arch/i386/kernel/cpu/cpu.h +++ b/arch/i386/kernel/cpu/cpu.h @@ -24,7 +24,5 @@ extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM]; extern int get_model_name(struct cpuinfo_x86 *c); extern void display_cacheinfo(struct cpuinfo_x86 *c); -extern void generic_identify(struct cpuinfo_x86 * c); - extern void early_intel_workaround(struct cpuinfo_x86 *c); diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index f03b7f94c30..bb02ed1fe56 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -427,7 +427,6 @@ static void cyrix_identify(struct cpuinfo_x86 * c) local_irq_restore(flags); } } - generic_identify(c); } static struct cpu_dev cyrix_cpu_dev __initdata = { @@ -457,7 +456,6 @@ static struct cpu_dev nsc_cpu_dev __initdata = { .c_vendor = "NSC", .c_ident = { "Geode by NSC" }, .c_init = init_nsc, - .c_identify = generic_identify, }; int __init nsc_init_cpu(void) diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 5a2e270924b..26243e019d1 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -263,7 +263,6 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = { }, }, .c_init = init_intel, - .c_identify = generic_identify, .c_size_cache = intel_size_cache, }; diff --git a/arch/i386/kernel/cpu/nexgen.c b/arch/i386/kernel/cpu/nexgen.c index ad87fa58058..df188bfd759 100644 --- a/arch/i386/kernel/cpu/nexgen.c +++ b/arch/i386/kernel/cpu/nexgen.c @@ -38,7 +38,6 @@ static void __init nexgen_identify(struct cpuinfo_x86 * c) if ( deep_magic_nexgen_probe() ) { strcpy(c->x86_vendor_id, "NexGenDriven"); } - generic_identify(c); } static struct cpu_dev nexgen_cpu_dev __initdata = { diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c index 7214c9b577a..4027c369bc9 100644 --- a/arch/i386/kernel/cpu/transmeta.c +++ b/arch/i386/kernel/cpu/transmeta.c @@ -88,7 +88,6 @@ static void __init init_transmeta(struct cpuinfo_x86 *c) static void __init transmeta_identify(struct cpuinfo_x86 * c) { u32 xlvl; - generic_identify(c); /* Transmeta-defined flags: level 0x80860001 */ xlvl = cpuid_eax(0x80860000); -- cgit v1.2.3 From 95414930548871c6c92a5b0e607b12b81f3d84d8 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] i386: mark cpu_dev 
structures as __cpuinitdata The different cpu_dev structures are all used from __cpuinit callers, as far as I can tell. So mark them as __cpuinitdata instead of __initdata. I am a little bit unsure about arch/i386/common.c:default_cpu, especially when it comes to the purpose of this_cpu. Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/amd.c | 2 +- arch/i386/kernel/cpu/centaur.c | 2 +- arch/i386/kernel/cpu/common.c | 2 +- arch/i386/kernel/cpu/cyrix.c | 4 ++-- arch/i386/kernel/cpu/nexgen.c | 2 +- arch/i386/kernel/cpu/rise.c | 2 +- arch/i386/kernel/cpu/transmeta.c | 2 +- arch/i386/kernel/cpu/umc.c | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index d6e7778bac7..eb134a20a7b 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -259,7 +259,7 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) return size; } -static struct cpu_dev amd_cpu_dev __initdata = { +static struct cpu_dev amd_cpu_dev __cpuinitdata = { .c_vendor = "AMD", .c_ident = { "AuthenticAMD" }, .c_models = { diff --git a/arch/i386/kernel/cpu/centaur.c b/arch/i386/kernel/cpu/centaur.c index bd75629dd26..b4d70dcea9f 100644 --- a/arch/i386/kernel/cpu/centaur.c +++ b/arch/i386/kernel/cpu/centaur.c @@ -457,7 +457,7 @@ static unsigned int centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size return size; } -static struct cpu_dev centaur_cpu_dev __initdata = { +static struct cpu_dev centaur_cpu_dev __cpuinitdata = { .c_vendor = "Centaur", .c_ident = { "CentaurHauls" }, .c_init = init_centaur, diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 24ac51a0237..99144449b9a 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -49,7 +49,7 @@ static void default_init(struct cpuinfo_x86 * c) } } -static struct cpu_dev default_cpu = { +static struct cpu_dev __cpuinitdata default_cpu = { .c_init = default_init, .c_vendor = "Unknown", }; diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index bb02ed1fe56..c2a0da9e0b5 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -429,7 +429,7 @@ static void cyrix_identify(struct cpuinfo_x86 * c) } } -static struct cpu_dev cyrix_cpu_dev __initdata = { +static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { .c_vendor = "Cyrix", .c_ident = { "CyrixInstead" }, .c_init = init_cyrix, @@ -452,7 +452,7 @@ static int __init cyrix_exit_cpu(void) late_initcall(cyrix_exit_cpu); -static struct cpu_dev nsc_cpu_dev __initdata = { +static struct cpu_dev nsc_cpu_dev __cpuinitdata = { .c_vendor = "NSC", .c_ident = { "Geode by NSC" }, .c_init = init_nsc, diff --git a/arch/i386/kernel/cpu/nexgen.c b/arch/i386/kernel/cpu/nexgen.c index df188bfd759..9d622598c05 100644 --- a/arch/i386/kernel/cpu/nexgen.c +++ b/arch/i386/kernel/cpu/nexgen.c @@ -40,7 +40,7 @@ static void __init nexgen_identify(struct cpuinfo_x86 * c) } } -static struct cpu_dev nexgen_cpu_dev __initdata = { +static struct cpu_dev nexgen_cpu_dev __cpuinitdata = { .c_vendor = "Nexgen", .c_ident = { "NexGenDriven" }, .c_models = { diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c index d08d5a2811c..b597e435f6c 100644 --- a/arch/i386/kernel/cpu/rise.c +++ b/arch/i386/kernel/cpu/rise.c @@ -28,7 +28,7 @@ static void __init init_rise(struct cpuinfo_x86 *c) set_bit(X86_FEATURE_CX8, c->x86_capability); } -static struct cpu_dev rise_cpu_dev __initdata = { +static
struct cpu_dev rise_cpu_dev __cpuinitdata = { .c_vendor = "Rise", .c_ident = { "RiseRiseRise" }, .c_models = { diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c index 4027c369bc9..0b6e8392ed3 100644 --- a/arch/i386/kernel/cpu/transmeta.c +++ b/arch/i386/kernel/cpu/transmeta.c @@ -97,7 +97,7 @@ static void __init transmeta_identify(struct cpuinfo_x86 * c) } } -static struct cpu_dev transmeta_cpu_dev __initdata = { +static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { .c_vendor = "Transmeta", .c_ident = { "GenuineTMx86", "TransmetaCPU" }, .c_init = init_transmeta, diff --git a/arch/i386/kernel/cpu/umc.c b/arch/i386/kernel/cpu/umc.c index 2cd988f6dc5..392b195948d 100644 --- a/arch/i386/kernel/cpu/umc.c +++ b/arch/i386/kernel/cpu/umc.c @@ -10,7 +10,7 @@ static void __init init_umc(struct cpuinfo_x86 * c) } -static struct cpu_dev umc_cpu_dev __initdata = { +static struct cpu_dev umc_cpu_dev __cpuinitdata = { .c_vendor = "UMC", .c_ident = { "UMC UMC UMC" }, .c_models = { -- cgit v1.2.3 From b4af3f7cf11e6b5904af08a652d4a2429af17c74 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] i386: mark cpu init functions as __cpuinit, data as __cpuinitdata Mark i386-specific cpu init functions as __cpuinit. They are all only called from arch/i386/common.c:identify_cpu() that already is marked as __cpuinit. This patch also removes the empty function init_umc(). Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/amd.c | 2 +- arch/i386/kernel/cpu/centaur.c | 20 ++++++++++---------- arch/i386/kernel/cpu/common.c | 2 +- arch/i386/kernel/cpu/cyrix.c | 32 ++++++++++++++++---------------- arch/i386/kernel/cpu/nexgen.c | 2 +- arch/i386/kernel/cpu/rise.c | 2 +- arch/i386/kernel/cpu/transmeta.c | 2 +- arch/i386/kernel/cpu/umc.c | 5 ----- 8 files changed, 31 insertions(+), 36 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index eb134a20a7b..856cd08f2f3 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -22,7 +22,7 @@ extern void vide(void); __asm__(".align 4\nvide: ret"); -static void __init init_amd(struct cpuinfo_x86 *c) +static void __cpuinit init_amd(struct cpuinfo_x86 *c) { u32 l, h; int mbytes = num_physpages >> (20-PAGE_SHIFT); diff --git a/arch/i386/kernel/cpu/centaur.c b/arch/i386/kernel/cpu/centaur.c index b4d70dcea9f..34a81ede789 100644 --- a/arch/i386/kernel/cpu/centaur.c +++ b/arch/i386/kernel/cpu/centaur.c @@ -9,7 +9,7 @@ #ifdef CONFIG_X86_OOSTORE -static u32 __init power2(u32 x) +static u32 __cpuinit power2(u32 x) { u32 s=1; while(s<=x) @@ -22,7 +22,7 @@ static u32 __init power2(u32 x) * Set up an actual MCR */ -static void __init centaur_mcr_insert(int reg, u32 base, u32 size, int key) +static void __cpuinit centaur_mcr_insert(int reg, u32 base, u32 size, int key) { u32 lo, hi; @@ -40,7 +40,7 @@ static void __init centaur_mcr_insert(int reg, u32 base, u32 size, int key) * Shortcut: We know you can't put 4Gig of RAM on a winchip */ -static u32 __init ramtop(void) /* 16388 */ +static u32 __cpuinit ramtop(void) /* 16388 */ { int i; u32 top = 0; @@ -91,7 +91,7 @@ static u32 __init ramtop(void) /* 16388 */ * Compute a set of MCR's to give maximum coverage */ -static int __init centaur_mcr_compute(int nr, int key) +static int __cpuinit centaur_mcr_compute(int nr, int key) { u32 mem = ramtop(); u32 root = power2(mem); @@ -166,7 +166,7 @@ static int __init centaur_mcr_compute(int nr, int key) return ct; } 
-static void __init centaur_create_optimal_mcr(void) +static void __cpuinit centaur_create_optimal_mcr(void) { int i; /* @@ -189,7 +189,7 @@ static void __init centaur_create_optimal_mcr(void) wrmsr(MSR_IDT_MCR0+i, 0, 0); } -static void __init winchip2_create_optimal_mcr(void) +static void __cpuinit winchip2_create_optimal_mcr(void) { u32 lo, hi; int i; @@ -227,7 +227,7 @@ static void __init winchip2_create_optimal_mcr(void) * Handle the MCR key on the Winchip 2. */ -static void __init winchip2_unprotect_mcr(void) +static void __cpuinit winchip2_unprotect_mcr(void) { u32 lo, hi; u32 key; @@ -239,7 +239,7 @@ static void __init winchip2_unprotect_mcr(void) wrmsr(MSR_IDT_MCR_CTRL, lo, hi); } -static void __init winchip2_protect_mcr(void) +static void __cpuinit winchip2_protect_mcr(void) { u32 lo, hi; @@ -257,7 +257,7 @@ static void __init winchip2_protect_mcr(void) #define RNG_ENABLED (1 << 3) #define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */ -static void __init init_c3(struct cpuinfo_x86 *c) +static void __cpuinit init_c3(struct cpuinfo_x86 *c) { u32 lo, hi; @@ -303,7 +303,7 @@ static void __init init_c3(struct cpuinfo_x86 *c) display_cacheinfo(c); } -static void __init init_centaur(struct cpuinfo_x86 *c) +static void __cpuinit init_centaur(struct cpuinfo_x86 *c) { enum { ECX8=1<<1, diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 99144449b9a..2799baaadf4 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -36,7 +36,7 @@ struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; extern int disable_pse; -static void default_init(struct cpuinfo_x86 * c) +static void __cpuinit default_init(struct cpuinfo_x86 * c) { /* Not much we can do here... */ /* Check if at least it has cpuid */ diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index c2a0da9e0b5..d34b1bdb15f 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -52,25 +52,25 @@ static void __init do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) * Actually since bugs.h doesn't even reference this perhaps someone should * fix the documentation ??? 
*/ -static unsigned char Cx86_dir0_msb __initdata = 0; +static unsigned char Cx86_dir0_msb __cpuinitdata = 0; -static char Cx86_model[][9] __initdata = { +static char Cx86_model[][9] __cpuinitdata = { "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ", "M II ", "Unknown" }; -static char Cx486_name[][5] __initdata = { +static char Cx486_name[][5] __cpuinitdata = { "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx", "SRx2", "DRx2" }; -static char Cx486S_name[][4] __initdata = { +static char Cx486S_name[][4] __cpuinitdata = { "S", "S2", "Se", "S2e" }; -static char Cx486D_name[][4] __initdata = { +static char Cx486D_name[][4] __cpuinitdata = { "DX", "DX2", "?", "?", "?", "DX4" }; -static char Cx86_cb[] __initdata = "?.5x Core/Bus Clock"; -static char cyrix_model_mult1[] __initdata = "12??43"; -static char cyrix_model_mult2[] __initdata = "12233445"; +static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock"; +static char cyrix_model_mult1[] __cpuinitdata = "12??43"; +static char cyrix_model_mult2[] __cpuinitdata = "12233445"; /* * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old @@ -82,7 +82,7 @@ static char cyrix_model_mult2[] __initdata = "12233445"; extern void calibrate_delay(void) __init; -static void __init check_cx686_slop(struct cpuinfo_x86 *c) +static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c) { unsigned long flags; @@ -107,7 +107,7 @@ static void __init check_cx686_slop(struct cpuinfo_x86 *c) } -static void __init set_cx86_reorder(void) +static void __cpuinit set_cx86_reorder(void) { u8 ccr3; @@ -122,7 +122,7 @@ static void __init set_cx86_reorder(void) setCx86(CX86_CCR3, ccr3); } -static void __init set_cx86_memwb(void) +static void __cpuinit set_cx86_memwb(void) { u32 cr0; @@ -137,7 +137,7 @@ static void __init set_cx86_memwb(void) setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 ); } -static void __init set_cx86_inc(void) +static void __cpuinit set_cx86_inc(void) { unsigned char ccr3; @@ -158,7 +158,7 @@ static void __init set_cx86_inc(void) * Configure later MediaGX and/or Geode processor. */ -static void __init geode_configure(void) +static void __cpuinit geode_configure(void) { unsigned long flags; u8 ccr3, ccr4; @@ -184,14 +184,14 @@ static void __init geode_configure(void) #ifdef CONFIG_PCI -static struct pci_device_id __initdata cyrix_55x0[] = { +static struct pci_device_id __cpuinitdata cyrix_55x0[] = { { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510) }, { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520) }, { }, }; #endif -static void __init init_cyrix(struct cpuinfo_x86 *c) +static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) { unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0; char *buf = c->x86_model_id; @@ -346,7 +346,7 @@ static void __init init_cyrix(struct cpuinfo_x86 *c) /* * Handle National Semiconductor branded processors */ -static void __init init_nsc(struct cpuinfo_x86 *c) +static void __cpuinit init_nsc(struct cpuinfo_x86 *c) { /* There may be GX1 processors in the wild that are branded * NSC and not Cyrix. diff --git a/arch/i386/kernel/cpu/nexgen.c b/arch/i386/kernel/cpu/nexgen.c index 9d622598c05..d8cc88224d1 100644 --- a/arch/i386/kernel/cpu/nexgen.c +++ b/arch/i386/kernel/cpu/nexgen.c @@ -27,7 +27,7 @@ static int __init deep_magic_nexgen_probe(void) return ret; } -static void __init init_nexgen(struct cpuinfo_x86 * c) +static void __cpuinit init_nexgen(struct cpuinfo_x86 * c) { c->x86_cache_size = 256; /* A few had 1 MB... 
*/ } diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c index b597e435f6c..9317f741498 100644 --- a/arch/i386/kernel/cpu/rise.c +++ b/arch/i386/kernel/cpu/rise.c @@ -5,7 +5,7 @@ #include "cpu.h" -static void __init init_rise(struct cpuinfo_x86 *c) +static void __cpuinit init_rise(struct cpuinfo_x86 *c) { printk("CPU: Rise iDragon"); if (c->x86_model > 2) diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c index 0b6e8392ed3..725404cd77f 100644 --- a/arch/i386/kernel/cpu/transmeta.c +++ b/arch/i386/kernel/cpu/transmeta.c @@ -5,7 +5,7 @@ #include #include "cpu.h" -static void __init init_transmeta(struct cpuinfo_x86 *c) +static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) { unsigned int cap_mask, uk, max, dummy; unsigned int cms_rev1, cms_rev2; diff --git a/arch/i386/kernel/cpu/umc.c b/arch/i386/kernel/cpu/umc.c index 392b195948d..1bf3f87e9c5 100644 --- a/arch/i386/kernel/cpu/umc.c +++ b/arch/i386/kernel/cpu/umc.c @@ -5,10 +5,6 @@ /* UMC chips appear to be only either 386 or 486, so no special init takes place. */ -static void __init init_umc(struct cpuinfo_x86 * c) -{ - -} static struct cpu_dev umc_cpu_dev __cpuinitdata = { .c_vendor = "UMC", @@ -21,7 +17,6 @@ static struct cpu_dev umc_cpu_dev __cpuinitdata = { } }, }, - .c_init = init_umc, }; int __init umc_init_cpu(void) -- cgit v1.2.3 From 5f0f1c166647860bb2c2a206338e7d9af3834753 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] i386: mark cpu identify functions as __cpuinit Mark i386-specific cpu identification functions as __cpuinit. They are all only called from arch/i386/common.c:identify_cpu() that already is marked as __cpuinit. Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/cyrix.c | 4 ++-- arch/i386/kernel/cpu/nexgen.c | 4 ++-- arch/i386/kernel/cpu/transmeta.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index d34b1bdb15f..c0c3b59de32 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -12,7 +12,7 @@ /* * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU */ -static void __init do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) { unsigned char ccr2, ccr3; unsigned long flags; @@ -394,7 +394,7 @@ static inline int test_cyrix_52div(void) return (unsigned char) (test >> 8) == 0x02; } -static void cyrix_identify(struct cpuinfo_x86 * c) +static void __cpuinit cyrix_identify(struct cpuinfo_x86 * c) { /* Detect Cyrix with disabled CPUID */ if ( c->x86 == 4 && test_cyrix_52div() ) { diff --git a/arch/i386/kernel/cpu/nexgen.c b/arch/i386/kernel/cpu/nexgen.c index d8cc88224d1..8bf23cc80c6 100644 --- a/arch/i386/kernel/cpu/nexgen.c +++ b/arch/i386/kernel/cpu/nexgen.c @@ -10,7 +10,7 @@ * to have CPUID. (Thanks to Herbert Oppmann) */ -static int __init deep_magic_nexgen_probe(void) +static int __cpuinit deep_magic_nexgen_probe(void) { int ret; @@ -32,7 +32,7 @@ static void __cpuinit init_nexgen(struct cpuinfo_x86 * c) c->x86_cache_size = 256; /* A few had 1 MB... 
*/ } -static void __init nexgen_identify(struct cpuinfo_x86 * c) +static void __cpuinit nexgen_identify(struct cpuinfo_x86 * c) { /* Detect NexGen with old hypercode */ if ( deep_magic_nexgen_probe() ) { diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c index 725404cd77f..4056fb7d2cd 100644 --- a/arch/i386/kernel/cpu/transmeta.c +++ b/arch/i386/kernel/cpu/transmeta.c @@ -85,7 +85,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) #endif } -static void __init transmeta_identify(struct cpuinfo_x86 * c) +static void __cpuinit transmeta_identify(struct cpuinfo_x86 * c) { u32 xlvl; -- cgit v1.2.3 From e9dff0ee6694b2edd40b1b448cb786f6a7b02336 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] i386: mark cpu cache functions as __cpuinit Mark i386-specific cpu cache functions as __cpuinit. They are all only called from arch/i386/common.c:display_cache_info() that already is marked as __cpuinit. Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/amd.c | 2 +- arch/i386/kernel/cpu/centaur.c | 2 +- arch/i386/kernel/cpu/intel.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index 856cd08f2f3..e4758095d87 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -246,7 +246,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) num_cache_leaves = 3; } -static unsigned int amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) +static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) { /* AMD errata T13 (order #21922) */ if ((c->x86 == 6)) { diff --git a/arch/i386/kernel/cpu/centaur.c b/arch/i386/kernel/cpu/centaur.c index 34a81ede789..8c25047975c 100644 --- a/arch/i386/kernel/cpu/centaur.c +++ b/arch/i386/kernel/cpu/centaur.c @@ -442,7 +442,7 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c) } } -static unsigned int centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size) +static unsigned int __cpuinit centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size) { /* VIA C3 CPUs (670-68F) need further shifting. */ if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8))) diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 26243e019d1..94a95aa5227 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -198,7 +198,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) } -static unsigned int intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) +static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) { /* Intel PIII Tualatin. This comes in two flavours. * One has 256kb of cache, the other 512. We have no way -- cgit v1.2.3 From 6f6b1e0477ccb2f25a9b045e38440347d2ce21c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20Luis=20V=E1zquez=20Cao?= Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] i386: Disallow kprobes on NMI handlers A kprobe executes IRET early and that could cause NMI recursion and stack corruption. Note: This problem was originally spotted by Andi Kleen. This patch adds fixes not included in his original patch. 
[AK: Jan Beulich originally discovered these classes of bugs] Signed-off-by: Fernando Vazquez Signed-off-by: Andi Kleen --- arch/i386/kernel/mca.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/mca.c b/arch/i386/kernel/mca.c index cd5456f14af..eb57a851789 100644 --- a/arch/i386/kernel/mca.c +++ b/arch/i386/kernel/mca.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -414,7 +415,8 @@ subsys_initcall(mca_init); /*--------------------------------------------------------------------*/ -static void mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag) +static __kprobes void +mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag) { int slot = mca_dev->slot; @@ -444,7 +446,7 @@ static void mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag) /*--------------------------------------------------------------------*/ -static int mca_handle_nmi_callback(struct device *dev, void *data) +static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data) { struct mca_device *mca_dev = to_mca_device(dev); unsigned char pos5; @@ -462,7 +464,7 @@ static int mca_handle_nmi_callback(struct device *dev, void *data) return 0; } -void mca_handle_nmi(void) +void __kprobes mca_handle_nmi(void) { /* First try - scan the various adapters and see if a specific * adapter was responsible for the error. -- cgit v1.2.3 From 06039754d775d3e48e4a292e4f353321205eff53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20Luis=20V=E1zquez=20Cao?= Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] i386: Disallow kprobes on NMI handlers A kprobe executes IRET early and that could cause NMI recursion and stack corruption. Note: This problem was originally spotted and solved by Andi Kleen in the x86_64 architecture. This patch is an adaption of his patch for i386. AK: Merged with current code which was a bit different. AK: Removed printk in nmi handler that shouldn't be there in the first time AK: Added missing include. AK: added KPROBES_END Signed-off-by: Fernando Vazquez Signed-off-by: Andi Kleen --- arch/i386/kernel/entry.S | 3 ++- arch/i386/kernel/nmi.c | 6 +++--- arch/i386/kernel/traps.c | 15 +++++++++------ 3 files changed, 14 insertions(+), 10 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index dede506e5bd..0928f70639a 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -729,7 +729,7 @@ KPROBE_END(debug) * check whether we got an NMI on the debug path where the debug * fault happened on the sysenter path. */ -ENTRY(nmi) +KPROBE_ENTRY(nmi) RING0_INT_FRAME pushl %eax CFI_ADJUST_CFA_OFFSET 4 @@ -805,6 +805,7 @@ nmi_16bit_stack: .align 4 .long 1b,iret_exc .previous +KPROBE_END(nmi) KPROBE_ENTRY(int3) RING0_INT_FRAME diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 7b9a053effa..dbda706fdd1 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -882,7 +883,7 @@ EXPORT_SYMBOL(touch_nmi_watchdog); extern void die_nmi(struct pt_regs *, const char *msg); -int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) +__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { /* @@ -962,8 +963,7 @@ int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) * This matches the old behaviour. 
*/ rc = 1; - } else - printk(KERN_WARNING "Unknown enabled NMI hardware?!\n"); + } } done: return rc; diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 00d643f3de4..5c0f4960c67 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -689,7 +689,8 @@ gp_in_kernel: } } -static void mem_parity_error(unsigned char reason, struct pt_regs * regs) +static __kprobes void +mem_parity_error(unsigned char reason, struct pt_regs * regs) { printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " "CPU %d.\n", reason, smp_processor_id()); @@ -704,7 +705,8 @@ static void mem_parity_error(unsigned char reason, struct pt_regs * regs) clear_mem_error(reason); } -static void io_check_error(unsigned char reason, struct pt_regs * regs) +static __kprobes void +io_check_error(unsigned char reason, struct pt_regs * regs) { unsigned long i; @@ -720,7 +722,8 @@ static void io_check_error(unsigned char reason, struct pt_regs * regs) outb(reason, 0x61); } -static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) +static __kprobes void +unknown_nmi_error(unsigned char reason, struct pt_regs * regs) { #ifdef CONFIG_MCA /* Might actually be able to figure out what the guilty party @@ -741,7 +744,7 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) static DEFINE_SPINLOCK(nmi_print_lock); -void die_nmi (struct pt_regs *regs, const char *msg) +void __kprobes die_nmi(struct pt_regs *regs, const char *msg) { if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP) @@ -773,7 +776,7 @@ void die_nmi (struct pt_regs *regs, const char *msg) do_exit(SIGSEGV); } -static void default_do_nmi(struct pt_regs * regs) +static __kprobes void default_do_nmi(struct pt_regs * regs) { unsigned char reason = 0; @@ -811,7 +814,7 @@ static void default_do_nmi(struct pt_regs * regs) reassert_nmi(); } -fastcall void do_nmi(struct pt_regs * regs, long error_code) +fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) { int cpu; -- cgit v1.2.3 From a549b86dd0f3cbffcd5f9343f4ae7fcd59f7e756 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] i386: annotate FIX_STACK() and the rest of nmi() In i386's entry.S, FIX_STACK() needs annotation because it replaces the stack pointer. And the rest of nmi() needs annotation in order to compile with these new annotations. 
Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen --- arch/i386/kernel/entry.S | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 0928f70639a..4b084524922 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -701,9 +701,15 @@ device_not_available_emulate: jne ok; \ label: \ movl TSS_sysenter_esp0+offset(%esp),%esp; \ + CFI_DEF_CFA esp, 0; \ + CFI_UNDEFINED eip; \ pushfl; \ + CFI_ADJUST_CFA_OFFSET 4; \ pushl $__KERNEL_CS; \ - pushl $sysenter_past_esp + CFI_ADJUST_CFA_OFFSET 4; \ + pushl $sysenter_past_esp; \ + CFI_ADJUST_CFA_OFFSET 4; \ + CFI_REL_OFFSET eip, 0 KPROBE_ENTRY(debug) RING0_INT_FRAME @@ -754,6 +760,7 @@ KPROBE_ENTRY(nmi) cmpl $sysenter_entry,12(%esp) je nmi_debug_stack_check nmi_stack_correct: + /* We have a RING0_INT_FRAME here */ pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL @@ -764,9 +771,12 @@ nmi_stack_correct: CFI_ENDPROC nmi_stack_fixup: + RING0_INT_FRAME FIX_STACK(12,nmi_stack_correct, 1) jmp nmi_stack_correct + nmi_debug_stack_check: + /* We have a RING0_INT_FRAME here */ cmpw $__KERNEL_CS,16(%esp) jne nmi_stack_correct cmpl $debug,(%esp) @@ -777,8 +787,10 @@ nmi_debug_stack_check: jmp nmi_stack_correct nmi_16bit_stack: - RING0_INT_FRAME - /* create the pointer to lss back */ + /* We have a RING0_INT_FRAME here. + * + * create the pointer to lss back + */ pushl %ss CFI_ADJUST_CFA_OFFSET 4 pushl %esp -- cgit v1.2.3 From 3566561bfadffcb5dbc85d576be80c0dbf2cccc9 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] i386: Avoid overwriting the current pgd (V4, i386) kexec: Avoid overwriting the current pgd (V4, i386) This patch upgrades the i386-specific kexec code to avoid overwriting the current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used to start a secondary kernel that dumps the memory of the previous kernel. The code introduces a new set of page tables. These tables are used to provide an executable identity mapping without overwriting the current pgd. 
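Reduced to a minimal non-PAE sketch (my_pgd, my_pte and identity_map_one are illustrative names, not code from the patch), the idea is to fill private, page-aligned tables and leave the live pgd untouched; the private pgd is only loaded into CR3 from within relocate_kernel itself:

/* 0x63 is the PAGE_ATTR used below:
 * _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
static u32 my_pgd[1024] __attribute__((__aligned__(4096)));
static u32 my_pte[1024] __attribute__((__aligned__(4096)));

static void identity_map_one(unsigned long paddr)
{
        my_pte[(paddr >> 12) & 0x3ff] = paddr | 0x63;   /* pte index: bits 21-12 */
        my_pgd[paddr >> 22] = __pa(my_pte) | 0x63;      /* pgd entry wants a physical address */
}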
Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- arch/i386/kernel/machine_kexec.c | 117 +++++++-------------------- arch/i386/kernel/relocate_kernel.S | 162 +++++++++++++++++++++++++++++++++---- 2 files changed, 174 insertions(+), 105 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c index 66c3dc99a65..91966bafb3d 100644 --- a/arch/i386/kernel/machine_kexec.c +++ b/arch/i386/kernel/machine_kexec.c @@ -21,70 +21,13 @@ #include #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) - -#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define L2_ATTR (_PAGE_PRESENT) - -#define LEVEL0_SIZE (1UL << 12UL) - -#ifndef CONFIG_X86_PAE -#define LEVEL1_SIZE (1UL << 22UL) -static u32 pgtable_level1[1024] PAGE_ALIGNED; - -static void identity_map_page(unsigned long address) -{ - unsigned long level1_index, level2_index; - u32 *pgtable_level2; - - /* Find the current page table */ - pgtable_level2 = __va(read_cr3()); - - /* Find the indexes of the physical address to identity map */ - level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; - level2_index = address / LEVEL1_SIZE; - - /* Identity map the page table entry */ - pgtable_level1[level1_index] = address | L0_ATTR; - pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; - - /* Flush the tlb so the new mapping takes effect. - * Global tlb entries are not flushed but that is not an issue. - */ - load_cr3(pgtable_level2); -} - -#else -#define LEVEL1_SIZE (1UL << 21UL) -#define LEVEL2_SIZE (1UL << 30UL) -static u64 pgtable_level1[512] PAGE_ALIGNED; -static u64 pgtable_level2[512] PAGE_ALIGNED; - -static void identity_map_page(unsigned long address) -{ - unsigned long level1_index, level2_index, level3_index; - u64 *pgtable_level3; - - /* Find the current page table */ - pgtable_level3 = __va(read_cr3()); - - /* Find the indexes of the physical address to identity map */ - level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; - level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE; - level3_index = address / LEVEL2_SIZE; - - /* Identity map the page table entry */ - pgtable_level1[level1_index] = address | L0_ATTR; - pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; - set_64bit(&pgtable_level3[level3_index], - __pa(pgtable_level2) | L2_ATTR); - - /* Flush the tlb so the new mapping takes effect. - * Global tlb entries are not flushed but that is not an issue. 
- */ - load_cr3(pgtable_level3); -} +static u32 kexec_pgd[1024] PAGE_ALIGNED; +#ifdef CONFIG_X86_PAE +static u32 kexec_pmd0[1024] PAGE_ALIGNED; +static u32 kexec_pmd1[1024] PAGE_ALIGNED; #endif +static u32 kexec_pte0[1024] PAGE_ALIGNED; +static u32 kexec_pte1[1024] PAGE_ALIGNED; static void set_idt(void *newidt, __u16 limit) { @@ -128,16 +71,6 @@ static void load_segments(void) #undef __STR } -typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)( - unsigned long indirection_page, - unsigned long reboot_code_buffer, - unsigned long start_address, - unsigned int has_pae) ATTRIB_NORET; - -extern const unsigned char relocate_new_kernel[]; -extern void relocate_new_kernel_end(void); -extern const unsigned int relocate_new_kernel_size; - /* * A architecture hook called to validate the * proposed image and prepare the control pages @@ -170,25 +103,29 @@ void machine_kexec_cleanup(struct kimage *image) */ NORET_TYPE void machine_kexec(struct kimage *image) { - unsigned long page_list; - unsigned long reboot_code_buffer; - - relocate_new_kernel_t rnk; + unsigned long page_list[PAGES_NR]; + void *control_page; /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); - /* Compute some offsets */ - reboot_code_buffer = page_to_pfn(image->control_code_page) - << PAGE_SHIFT; - page_list = image->head; - - /* Set up an identity mapping for the reboot_code_buffer */ - identity_map_page(reboot_code_buffer); - - /* copy it out */ - memcpy((void *)reboot_code_buffer, relocate_new_kernel, - relocate_new_kernel_size); + control_page = page_address(image->control_code_page); + memcpy(control_page, relocate_kernel, PAGE_SIZE); + + page_list[PA_CONTROL_PAGE] = __pa(control_page); + page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; + page_list[PA_PGD] = __pa(kexec_pgd); + page_list[VA_PGD] = (unsigned long)kexec_pgd; +#ifdef CONFIG_X86_PAE + page_list[PA_PMD_0] = __pa(kexec_pmd0); + page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; + page_list[PA_PMD_1] = __pa(kexec_pmd1); + page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; +#endif + page_list[PA_PTE_0] = __pa(kexec_pte0); + page_list[VA_PTE_0] = (unsigned long)kexec_pte0; + page_list[PA_PTE_1] = __pa(kexec_pte1); + page_list[VA_PTE_1] = (unsigned long)kexec_pte1; /* The segment registers are funny things, they have both a * visible and an invisible part. 
Whenever the visible part is @@ -207,8 +144,8 @@ NORET_TYPE void machine_kexec(struct kimage *image) set_idt(phys_to_virt(0),0); /* now call it */ - rnk = (relocate_new_kernel_t) reboot_code_buffer; - (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae); + relocate_kernel((unsigned long)image->head, (unsigned long)page_list, + image->start, cpu_has_pae); } /* crashkernel=size@addr specifies the location to reserve for diff --git a/arch/i386/kernel/relocate_kernel.S b/arch/i386/kernel/relocate_kernel.S index d312616effa..f151d6fae46 100644 --- a/arch/i386/kernel/relocate_kernel.S +++ b/arch/i386/kernel/relocate_kernel.S @@ -7,16 +7,138 @@ */ #include +#include +#include + +/* + * Must be relocatable PIC code callable as a C function + */ + +#define PTR(x) (x << 2) +#define PAGE_ALIGNED (1 << PAGE_SHIFT) +#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */ +#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */ + + .text + .align PAGE_ALIGNED + .globl relocate_kernel +relocate_kernel: + movl 8(%esp), %ebp /* list of pages */ + +#ifdef CONFIG_X86_PAE + /* map the control page at its virtual address */ + + movl PTR(VA_PGD)(%ebp), %edi + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax + andl $0xc0000000, %eax + shrl $27, %eax + addl %edi, %eax + + movl PTR(PA_PMD_0)(%ebp), %edx + orl $PAE_PGD_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PMD_0)(%ebp), %edi + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax + andl $0x3fe00000, %eax + shrl $18, %eax + addl %edi, %eax + + movl PTR(PA_PTE_0)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PTE_0)(%ebp), %edi + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax + andl $0x001ff000, %eax + shrl $9, %eax + addl %edi, %eax + + movl PTR(PA_CONTROL_PAGE)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + /* identity map the control page at its physical address */ + + movl PTR(VA_PGD)(%ebp), %edi + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax + andl $0xc0000000, %eax + shrl $27, %eax + addl %edi, %eax + + movl PTR(PA_PMD_1)(%ebp), %edx + orl $PAE_PGD_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PMD_1)(%ebp), %edi + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax + andl $0x3fe00000, %eax + shrl $18, %eax + addl %edi, %eax + + movl PTR(PA_PTE_1)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PTE_1)(%ebp), %edi + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax + andl $0x001ff000, %eax + shrl $9, %eax + addl %edi, %eax + + movl PTR(PA_CONTROL_PAGE)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) +#else + /* map the control page at its virtual address */ + + movl PTR(VA_PGD)(%ebp), %edi + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax + andl $0xffc00000, %eax + shrl $20, %eax + addl %edi, %eax + + movl PTR(PA_PTE_0)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PTE_0)(%ebp), %edi + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax + andl $0x003ff000, %eax + shrl $10, %eax + addl %edi, %eax + + movl PTR(PA_CONTROL_PAGE)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + /* identity map the control page at its physical address */ + + movl PTR(VA_PGD)(%ebp), %edi + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax + andl $0xffc00000, %eax + shrl $20, %eax + addl %edi, %eax + + movl PTR(PA_PTE_1)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PTE_1)(%ebp), %edi + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax + andl $0x003ff000, %eax + shrl $10, %eax + addl %edi, %eax + + movl PTR(PA_CONTROL_PAGE)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) +#endif - /* - * Must be relocatable PIC code callable as a C 
function, that once - * it starts can not use the previous processes stack. - */ - .globl relocate_new_kernel relocate_new_kernel: /* read the arguments and say goodbye to the stack */ movl 4(%esp), %ebx /* page_list */ - movl 8(%esp), %ebp /* reboot_code_buffer */ + movl 8(%esp), %ebp /* list of pages */ movl 12(%esp), %edx /* start address */ movl 16(%esp), %ecx /* cpu_has_pae */ @@ -24,11 +146,26 @@ relocate_new_kernel: pushl $0 popfl - /* set a new stack at the bottom of our page... */ - lea 4096(%ebp), %esp + /* get physical address of control page now */ + /* this is impossible after page table switch */ + movl PTR(PA_CONTROL_PAGE)(%ebp), %edi - /* store the parameters back on the stack */ - pushl %edx /* store the start address */ + /* switch to new set of page tables */ + movl PTR(PA_PGD)(%ebp), %eax + movl %eax, %cr3 + + /* setup a new stack at the end of the physical control page */ + lea 4096(%edi), %esp + + /* jump to identity mapped page */ + movl %edi, %eax + addl $(identity_mapped - relocate_kernel), %eax + pushl %eax + ret + +identity_mapped: + /* store the start address on the stack */ + pushl %edx /* Set cr0 to a known state: * 31 0 == Paging disabled @@ -113,8 +250,3 @@ relocate_new_kernel: xorl %edi, %edi xorl %ebp, %ebp ret -relocate_new_kernel_end: - - .globl relocate_new_kernel_size -relocate_new_kernel_size: - .long relocate_new_kernel_end - relocate_new_kernel -- cgit v1.2.3 From 0da5db313317e3195482d3e660a1074857374a89 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] i386: Abstract sensitive instructions Abstract sensitive instructions in assembler code, replacing them with macros (which currently are #defined to the native versions). We use long names: assembler is case-insensitive, so if something goes wrong and macros do not expand, it would assemble anyway. Resulting object files are exactly the same as before. Signed-off-by: Rusty Russell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/entry.S | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 4b084524922..3872fca5c74 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -76,8 +76,15 @@ DF_MASK = 0x00000400 NT_MASK = 0x00004000 VM_MASK = 0x00020000 +/* These are replaces for paravirtualization */ +#define DISABLE_INTERRUPTS cli +#define ENABLE_INTERRUPTS sti +#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit +#define INTERRUPT_RETURN iret +#define GET_CR0_INTO_EAX movl %cr0, %eax + #ifdef CONFIG_PREEMPT -#define preempt_stop cli; TRACE_IRQS_OFF +#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF #else #define preempt_stop #define resume_kernel restore_nocheck @@ -236,7 +243,7 @@ check_userspace: testl $(VM_MASK | 3), %eax jz resume_kernel ENTRY(resume_userspace) - cli # make sure we don't miss an interrupt + DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret movl TI_flags(%ebp), %ecx @@ -247,7 +254,7 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) - cli + DISABLE_INTERRUPTS cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? 
jnz restore_nocheck need_resched: @@ -275,7 +282,7 @@ sysenter_past_esp: * No need to follow this irqs on/off section: the syscall * disabled irqs and here we enable it straight after entry: */ - sti + ENABLE_INTERRUPTS pushl $(__USER_DS) CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET ss, 0*/ @@ -320,7 +327,7 @@ sysenter_past_esp: jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,EAX(%esp) - cli + DISABLE_INTERRUPTS TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx @@ -330,8 +337,7 @@ sysenter_past_esp: movl OLDESP(%esp), %ecx xorl %ebp,%ebp TRACE_IRQS_ON - sti - sysexit + ENABLE_INTERRUPTS_SYSEXIT CFI_ENDPROC @@ -356,7 +362,7 @@ syscall_call: call *sys_call_table(,%eax,4) movl %eax,EAX(%esp) # store the return value syscall_exit: - cli # make sure we don't miss an interrupt + DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF @@ -381,11 +387,11 @@ restore_nocheck_notrace: RESTORE_REGS addl $4, %esp CFI_ADJUST_CFA_OFFSET -4 -1: iret +1: INTERRUPT_RETURN .section .fixup,"ax" iret_exc: TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS pushl $0 # no error code pushl $do_iret_error jmp error_code @@ -409,7 +415,7 @@ ldt_ss: * dosemu and wine happy. */ subl $8, %esp # reserve space for switch16 pointer CFI_ADJUST_CFA_OFFSET 8 - cli + DISABLE_INTERRUPTS TRACE_IRQS_OFF movl %esp, %eax /* Set up the 16bit stack frame with switch32 pointer on top, @@ -419,7 +425,7 @@ ldt_ss: TRACE_IRQS_IRET RESTORE_REGS lss 20+4(%esp), %esp # switch to 16bit stack -1: iret +1: INTERRUPT_RETURN .section __ex_table,"a" .align 4 .long 1b,iret_exc @@ -434,7 +440,7 @@ work_pending: jz work_notifysig work_resched: call schedule - cli # make sure we don't miss an interrupt + DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF @@ -490,7 +496,7 @@ syscall_exit_work: testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl jz work_pending TRACE_IRQS_ON - sti # could let do_syscall_trace() call + ENABLE_INTERRUPTS # could let do_syscall_trace() call # schedule() instead movl %esp, %eax movl $1, %edx @@ -668,7 +674,7 @@ ENTRY(device_not_available) pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL - movl %cr0, %eax + GET_CR0_INTO_EAX testl $0x4, %eax # EM (math emulation bit) jne device_not_available_emulate preempt_stop @@ -811,7 +817,7 @@ nmi_16bit_stack: call do_nmi RESTORE_REGS lss 12+4(%esp), %esp # back to 16bit stack -1: iret +1: INTERRUPT_RETURN CFI_ENDPROC .section __ex_table,"a" .align 4 -- cgit v1.2.3 From 78be3706b21a232310590fe00258b224177ac05f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] i386: Allow a kernel not to be in ring 0 We allow for the fact that the guest kernel may not run in ring 0. This requires some abstraction in a few places when setting %cs or checking privilege level (user vs kernel). This is Chris' [RFC PATCH 15/33] move segment checks to subarch, except rather than using #define USER_MODE_MASK which depends on a config option, we use Zach's more flexible approach of assuming ring 3 == userspace. I also used "get_kernel_rpl()" over "get_kernel_cs()" because I think it reads better in the code... 1) Remove the hardcoded 3 and introduce #define SEGMENT_RPL_MASK 3 2) Add a get_kernel_rpl() macro, and don't assume it's zero. And: Clean up of patch for letting kernel run other than ring 0: a. 
Add some comments about the SEGMENT_IS_*_CODE() macros. b. Add a USER_RPL macro. (Code was comparing a value to a mask in some places and to the magic number 3 in other places.) c. Add macros for table indicator field and use them. d. Change the entry.S tests for LDT stack segment to use the macros Signed-off-by: Rusty Russell Signed-off-by: Zachary Amsden Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/entry.S | 9 +++++---- arch/i386/kernel/process.c | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 3872fca5c74..284f2e908ad 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -240,8 +240,9 @@ ret_from_intr: check_userspace: movl EFLAGS(%esp), %eax # mix EFLAGS and CS movb CS(%esp), %al - testl $(VM_MASK | 3), %eax - jz resume_kernel + andl $(VM_MASK | SEGMENT_RPL_MASK), %eax + cmpl $USER_RPL, %eax + jb resume_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending @@ -377,8 +378,8 @@ restore_all: # See comments in process.c:copy_thread() for details. movb OLDSS(%esp), %ah movb CS(%esp), %al - andl $(VM_MASK | (4 << 8) | 3), %eax - cmpl $((4 << 8) | 3), %eax + andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax CFI_REMEMBER_STATE je ldt_ss # returning to user-space with LDT SS restore_nocheck: diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 220aeca59c3..8c190ca7ae4 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -338,7 +338,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) regs.xes = __USER_DS; regs.orig_eax = -1; regs.eip = (unsigned long) kernel_thread_helper; - regs.xcs = __KERNEL_CS; + regs.xcs = __KERNEL_CS | get_kernel_rpl(); regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; /* Ok, create the new process.. */ -- cgit v1.2.3 From ec5c09269ba9cd3b9cf879390db6d5c7dcdcaf1e Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] i386: Do better early exception handlers Add early i386 fault handlers with debug information for common faults. Handles: divide error invalid opcode protection fault page fault Also adds code to detect early recursive/multiple faults and halt the system when they happen (taken from x86_64.) 
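The set_early_handler macro added below composes a 32-bit interrupt-gate descriptor by hand; the same bit layout rendered in C (a sketch with illustrative names, see the asm in the diff for the authoritative version):

struct gate { u32 lo, hi; };

static struct gate make_intr_gate(u32 handler, u16 selector)
{
        struct gate g;

        g.lo = ((u32)selector << 16) | (handler & 0xffff);  /* selector : offset[15:0] */
        g.hi = (handler & 0xffff0000) | 0x8e00;  /* offset[31:16] + present, DPL 0, intr gate */
        return g;
}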
Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Ingo Molnar Signed-off-by: Andrew Morton --- arch/i386/kernel/head.S | 67 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index a6b8bd89aa2..be9d883c62c 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -371,8 +371,65 @@ rp_sidt: addl $8,%edi dec %ecx jne rp_sidt + +.macro set_early_handler handler,trapno + lea \handler,%edx + movl $(__KERNEL_CS << 16),%eax + movw %dx,%ax + movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ + lea idt_table,%edi + movl %eax,8*\trapno(%edi) + movl %edx,8*\trapno+4(%edi) +.endm + + set_early_handler handler=early_divide_err,trapno=0 + set_early_handler handler=early_illegal_opcode,trapno=6 + set_early_handler handler=early_protection_fault,trapno=13 + set_early_handler handler=early_page_fault,trapno=14 + ret +early_divide_err: + xor %edx,%edx + pushl $0 /* fake errcode */ + jmp early_fault + +early_illegal_opcode: + movl $6,%edx + pushl $0 /* fake errcode */ + jmp early_fault + +early_protection_fault: + movl $13,%edx + jmp early_fault + +early_page_fault: + movl $14,%edx + jmp early_fault + +early_fault: + cld +#ifdef CONFIG_PRINTK + movl $(__KERNEL_DS),%eax + movl %eax,%ds + movl %eax,%es + cmpl $2,early_recursion_flag + je hlt_loop + incl early_recursion_flag + movl %cr2,%eax + pushl %eax + pushl %edx /* trapno */ + pushl $fault_msg +#ifdef CONFIG_EARLY_PRINTK + call early_printk +#else + call printk +#endif +#endif +hlt_loop: + hlt + jmp hlt_loop + /* This is the default interrupt "handler" :-) */ ALIGN ignore_int: @@ -386,6 +443,9 @@ ignore_int: movl $(__KERNEL_DS),%eax movl %eax,%ds movl %eax,%es + cmpl $2,early_recursion_flag + je hlt_loop + incl early_recursion_flag pushl 16(%esp) pushl 24(%esp) pushl 32(%esp) @@ -431,9 +491,16 @@ ENTRY(stack_start) ready: .byte 0 +early_recursion_flag: + .long 0 + int_msg: .asciz "Unknown interrupt or fault at EIP %p %p %p\n" +fault_msg: + .ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n" + .asciz "Stack: %p %p %p %p %p %p %p %p\n" + /* * The IDT and GDT 'descriptors' are a strange 48-bit object * only used by the lidt and lgdt instructions. They are not -- cgit v1.2.3 From f0f4c3432e5e1087b3a8c0e6bd4113d3c37497ff Mon Sep 17 00:00:00 2001 From: "adurbin@google.com" Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] i386: add HPET(s) into resource map Add HPET(s) into resource map. This will allow for the HPET(s) to be visibile within /proc/iomem. 
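Stripped of the bootmem allocation done in the real code below, the registration amounts to this fragment (hpet_phys stands in for the base address parsed out of the ACPI HPET table):

static struct resource my_hpet_res = {
        .name  = "HPET 0",
        .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
};

my_hpet_res.start = hpet_phys;
my_hpet_res.end   = hpet_phys + 1024 - 1;       /* fixed 1K register block */
insert_resource(&iomem_resource, &my_hpet_res);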
Signed-off-by: Aaron Durbin Signed-off-by: Andi Kleen --- arch/i386/kernel/acpi/boot.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index 8a6c5b41234..1aaea6ab8c4 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -30,6 +30,8 @@ #include #include #include +#include +#include #include #include @@ -646,6 +648,8 @@ static int __init acpi_parse_sbf(unsigned long phys_addr, unsigned long size) static int __init acpi_parse_hpet(unsigned long phys, unsigned long size) { struct acpi_table_hpet *hpet_tbl; + struct resource *hpet_res; + resource_size_t res_start; if (!phys || !size) return -EINVAL; @@ -661,12 +665,26 @@ static int __init acpi_parse_hpet(unsigned long phys, unsigned long size) "memory.\n"); return -1; } + +#define HPET_RESOURCE_NAME_SIZE 9 + hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE); + if (hpet_res) { + memset(hpet_res, 0, sizeof(*hpet_res)); + hpet_res->name = (void *)&hpet_res[1]; + hpet_res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE, + "HPET %u", hpet_tbl->number); + hpet_res->end = (1 * 1024) - 1; + } + #ifdef CONFIG_X86_64 vxtime.hpet_address = hpet_tbl->addr.addrl | ((long)hpet_tbl->addr.addrh << 32); printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", hpet_tbl->id, vxtime.hpet_address); + + res_start = vxtime.hpet_address; #else /* X86 */ { extern unsigned long hpet_address; @@ -674,9 +692,17 @@ static int __init acpi_parse_hpet(unsigned long phys, unsigned long size) hpet_address = hpet_tbl->addr.addrl; printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", hpet_tbl->id, hpet_address); + + res_start = hpet_address; } #endif /* X86 */ + if (hpet_res) { + hpet_res->start = res_start; + hpet_res->end += res_start; + insert_resource(&iomem_resource, hpet_res); + } + return 0; } #else -- cgit v1.2.3 From f354b3a92af9b132b188b9c8ebbfb74de699926d Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] i386: Split multi-line printk in oops output. Sometimes, bug reports come in where we've had an oops, and the only record we have is what the reporter saw on screen shortly before the system locked up completely. Unfortunatly, syslog only prints lines beginning with KERN_EMERG to the console, so some lines get lost. An example of this can be seen at https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=203723 Some of this information isn't vital to diagnosis, but some parts are useful, such as the tainted flag. 
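The fix below leans on plain C string-literal concatenation: KERN_EMERG is just a loglevel prefix string ("<0>" in kernels of this vintage), so repeating it after every newline re-tags each physical line. A toy illustration with a stand-in macro:

#define MY_EMERG "<0>"          /* what KERN_EMERG expands to here */

/* one printk call, but every line carries its own loglevel prefix */
printk(MY_EMERG "CPU: %d\n"
       MY_EMERG "EIP: %04x:[<%08lx>]\n"
       MY_EMERG "EFLAGS: %08lx\n",
       cpu, cs, eip, eflags);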
Signed-off-by: Dave Jones Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 5c0f4960c67..c7adb076e81 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -351,8 +351,9 @@ void show_registers(struct pt_regs *regs) ss = regs->xss & 0xffff; } print_modules(); - printk(KERN_EMERG "CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\n" - "EFLAGS: %08lx (%s %.*s) \n", + printk(KERN_EMERG "CPU: %d\n" + KERN_EMERG "EIP: %04x:[<%08lx>] %s VLI\n" + KERN_EMERG "EFLAGS: %08lx (%s %.*s)\n", smp_processor_id(), 0xffff & regs->xcs, regs->eip, print_tainted(), regs->eflags, system_utsname.release, (int)strcspn(system_utsname.version, " "), -- cgit v1.2.3 From 35d534a3ff4905c0a7bddbad0fc9df55967742c7 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] x86: - restore i8259A eoi status on resume Got it. i8259A_resume calls init_8259A(0) unconditionally, even if auto_eoi has been set. Keep track of the current status and restore that on resume. This fixes it for AMD64 and i386. Signed-off-by: Matthew Garrett Signed-off-by: Andi Kleen --- arch/i386/kernel/i8259.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c index d4756d154f4..ea5f4e7958d 100644 --- a/arch/i386/kernel/i8259.c +++ b/arch/i386/kernel/i8259.c @@ -45,6 +45,8 @@ static void end_8259A_irq (unsigned int irq) #define shutdown_8259A_irq disable_8259A_irq +static int i8259A_auto_eoi; + static void mask_and_ack_8259A(unsigned int); unsigned int startup_8259A_irq(unsigned int irq) @@ -253,7 +255,7 @@ static void save_ELCR(char *trigger) static int i8259A_resume(struct sys_device *dev) { - init_8259A(0); + init_8259A(i8259A_auto_eoi); restore_ELCR(irq_trigger); return 0; } @@ -301,6 +303,8 @@ void init_8259A(int auto_eoi) { unsigned long flags; + i8259A_auto_eoi = auto_eoi; + spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ -- cgit v1.2.3 From adf1423698f00d00b267f7dca8231340ce7d65ef Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] i386/x86-64: Work around gcc bug with noreturn functions in unwinder Current gcc generates calls not jumps to noreturn functions. When that happens the return address can point to the next function, which confuses the unwinder. This patch works around it by marking asynchronous exception frames in contrast normal call frames in the unwind information. Then teach the unwinder to decode this. For normal call frames the unwinder now subtracts one from the address which avoids this problem. The standard libgcc unwinder uses the same trick. It doesn't include adjustment of the printed address (i.e. for the original example, it'd still be kernel_math_error+0 that gets displayed, but the unwinder wouldn't get confused anymore. 
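A small illustration of the underlying problem (hypothetical functions, not from the patch): when the call to a noreturn function is the caller's last instruction, the pushed return address is the first byte of whatever function the linker placed next, so a naive lookup attributes the frame to the wrong symbol.

extern void fail(const char *msg) __attribute__((__noreturn__));

void caller(void)
{
        fail("boom");   /* gcc emits a call, not a jmp; the return
                         * address pushed here equals &next_func */
}

/* a naive unwinder blames this function, hence the addr - 1 lookup */
void next_func(void)
{
}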
This only works with binutils 2.6.17+ and some versions of H.J.Lu's 2.6.16 unfortunately because earlier binutils don't support .cfi_signal_frame [AK: added automatic detection of the new binutils and wrote description] Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/kernel/entry.S | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 284f2e908ad..5a63d6fdb70 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -183,18 +183,21 @@ VM_MASK = 0x00020000 #define RING0_INT_FRAME \ CFI_STARTPROC simple;\ + CFI_SIGNAL_FRAME;\ CFI_DEF_CFA esp, 3*4;\ /*CFI_OFFSET cs, -2*4;*/\ CFI_OFFSET eip, -3*4 #define RING0_EC_FRAME \ CFI_STARTPROC simple;\ + CFI_SIGNAL_FRAME;\ CFI_DEF_CFA esp, 4*4;\ /*CFI_OFFSET cs, -2*4;*/\ CFI_OFFSET eip, -3*4 #define RING0_PTREGS_FRAME \ CFI_STARTPROC simple;\ + CFI_SIGNAL_FRAME;\ CFI_DEF_CFA esp, OLDESP-EBX;\ /*CFI_OFFSET cs, CS-OLDESP;*/\ CFI_OFFSET eip, EIP-OLDESP;\ @@ -275,6 +278,7 @@ need_resched: # sysenter call handler stub ENTRY(sysenter_entry) CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA esp, 0 CFI_REGISTER esp, ebp movl TSS_sysenter_esp0(%esp),%esp -- cgit v1.2.3 From f157cbb1eb9ce3f33a401ec6d20eb3eb852351a3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] i386/x86-64: Make all early PCI scans dependent on CONFIG_PCI This is useful on systems with broken PCI bus. Affects various scans in x86-64 and i386's early ACPI quirk scan. Cc: gregkh@suse.de Cc: len.brown@intel.com Cc: Trammell Hudson Signed-off-by: Andi Kleen --- arch/i386/kernel/acpi/Makefile | 2 ++ arch/i386/kernel/setup.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/acpi/Makefile b/arch/i386/kernel/acpi/Makefile index 7e9ac99354f..7f7be01f44e 100644 --- a/arch/i386/kernel/acpi/Makefile +++ b/arch/i386/kernel/acpi/Makefile @@ -1,5 +1,7 @@ obj-$(CONFIG_ACPI) += boot.o +ifneq ($(CONFIG_PCI),) obj-$(CONFIG_X86_IO_APIC) += earlyquirk.o +endif obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o ifneq ($(CONFIG_ACPI_PROCESSOR),) diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index ea17567dbe7..7a99b1369fa 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -1437,9 +1437,11 @@ void __init setup_arch(char **cmdline_p) acpi_boot_table_init(); #endif +#ifdef CONFIG_PCI #ifdef CONFIG_X86_IO_APIC check_acpi_pci(); /* Checks more than just ACPI actually */ #endif +#endif #ifdef CONFIG_ACPI acpi_boot_init(); -- cgit v1.2.3 From 0637a70a5db98182d9ad3d6ae1ee30acf20afde9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] x86: Allow disabling early pci scans with pci=noearly or disallowing conf1 Some buggy systems can machine check when config space accesses happen for some non existent devices. i386/x86-64 do some early device scans that might trigger this. Allow pci=noearly to disable this. Also when type 1 is disabling also don't do any early accesses which are always type1. This moves the pci= configuration parsing to be a early parameter. I don't think this can break anything because it only changes a single global that is only used by PCI. 
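For context, the early scans being disabled here are raw type-1 configuration cycles on I/O ports 0xCF8/0xCFC. A sketch of such an access with the new guard in front (early_read_config is an illustrative name; early_pci_allowed() is the real helper):

static u32 early_read_config(int bus, int slot, int func, int off)
{
        if (!early_pci_allowed())       /* pci=noearly, or conf1 ruled out */
                return ~0;
        outl(0x80000000 | (bus << 16) | (slot << 11) | (func << 8) |
             (off & 0xfc), 0xcf8);      /* select bus/device/function/register */
        return inl(0xcfc);              /* read the dword */
}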
Cc: gregkh@suse.de Cc: Trammell Hudson Signed-off-by: Andi Kleen --- arch/i386/kernel/acpi/earlyquirk.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c index 1649a175a20..fe799b11ac0 100644 --- a/arch/i386/kernel/acpi/earlyquirk.c +++ b/arch/i386/kernel/acpi/earlyquirk.c @@ -48,7 +48,11 @@ void __init check_acpi_pci(void) int num, slot, func; /* Assume the machine supports type 1. If not it will - always read ffffffff and should not have any side effect. */ + always read ffffffff and should not have any side effect. + Actually a few buggy systems can machine check. Allow the user + to disable it by command line option at least -AK */ + if (!early_pci_allowed()) + return; /* Poor man's PCI discovery */ for (num = 0; num < 32; num++) { -- cgit v1.2.3 From 15d5f8398311f565682959daaca30e3ca7aea600 Mon Sep 17 00:00:00 2001 From: Dmitriy Zavin Date: Tue, 26 Sep 2006 10:52:42 +0200 Subject: [PATCH] x86: Refactor thermal throttle processing Refactor the event processing (syslog messaging and rate limiting) into separate file therm_throt.c. This allows consistent reporting of CPU thermal throttle events. After ACK'ing the interrupt, if the event is current, the user (p4.c/mce_intel.c) calls therm_throt_process to log (and rate limit) the event. If that function returns 1, the user has the option to log things further (such as to mce_log in x86_64). AK: minor cleanup Signed-off-by: Dmitriy Zavin Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/mcheck/Makefile | 2 +- arch/i386/kernel/cpu/mcheck/p4.c | 23 ++++-------- arch/i386/kernel/cpu/mcheck/therm_throt.c | 58 +++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 18 deletions(-) create mode 100644 arch/i386/kernel/cpu/mcheck/therm_throt.c (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/mcheck/Makefile b/arch/i386/kernel/cpu/mcheck/Makefile index 30808f3d671..f1ebe1c1c17 100644 --- a/arch/i386/kernel/cpu/mcheck/Makefile +++ b/arch/i386/kernel/cpu/mcheck/Makefile @@ -1,2 +1,2 @@ -obj-y = mce.o k7.o p4.o p5.o p6.o winchip.o +obj-y = mce.o k7.o p4.o p5.o p6.o winchip.o therm_throt.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c index b95f1b3d53a..d83a669d376 100644 --- a/arch/i386/kernel/cpu/mcheck/p4.c +++ b/arch/i386/kernel/cpu/mcheck/p4.c @@ -13,6 +13,8 @@ #include #include +#include + #include "mce.h" /* as supported by the P4/Xeon family */ @@ -44,25 +46,12 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs) /* P4/Xeon Thermal transition interrupt handler */ static void intel_thermal_interrupt(struct pt_regs *regs) { - u32 l, h; - unsigned int cpu = smp_processor_id(); - static unsigned long next[NR_CPUS]; + __u64 msr_val; ack_APIC_irq(); - if (time_after(next[cpu], jiffies)) - return; - - next[cpu] = jiffies + HZ*5; - rdmsr(MSR_IA32_THERM_STATUS, l, h); - if (l & 0x1) { - printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu); - printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n", - cpu); - add_taint(TAINT_MACHINE_CHECK); - } else { - printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); - } + rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + therm_throt_process(msr_val & 0x1); } /* Thermal interrupt handler for this CPU setup */ @@ -122,7 +111,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) rdmsr (MSR_IA32_MISC_ENABLE, l, h); wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), 
h); - + l = apic_read (APIC_LVTTHMR); apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED); printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c new file mode 100644 index 00000000000..85eba00d680 --- /dev/null +++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c @@ -0,0 +1,58 @@ +/* + * linux/arch/i386/kerne/cpu/mcheck/therm_throt.c + * + * Thermal throttle event support code. + * + * Author: Dmitriy Zavin (dmitriyz@google.com) + * + * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. + * + */ + +#include +#include +#include +#include +#include + +/* How long to wait between reporting thermal events */ +#define CHECK_INTERVAL (300 * HZ) + +static DEFINE_PER_CPU(unsigned long, next_check); + +/*** + * therm_throt_process - Process thermal throttling event + * @curr: Whether the condition is current or not (boolean), since the + * thermal interrupt normally gets called both when the thermal + * event begins and once the event has ended. + * + * This function is normally called by the thermal interrupt after the + * IRQ has been acknowledged. + * + * It will take care of rate limiting and printing messages to the syslog. + * + * Returns: 0 : Event should NOT be further logged, i.e. still in + * "timeout" from previous log message. + * 1 : Event should be logged further, and a message has been + * printed to the syslog. + */ +int therm_throt_process(int curr) +{ + unsigned int cpu = smp_processor_id(); + + if (time_before(jiffies, __get_cpu_var(next_check))) + return 0; + + __get_cpu_var(next_check) = jiffies + CHECK_INTERVAL; + + /* if we just entered the thermal event */ + if (curr) { + printk(KERN_CRIT "CPU%d: Temperature above threshold, " + "cpu clock throttled\n", cpu); + add_taint(TAINT_MACHINE_CHECK); + } else { + printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu); + } + + return 1; +} -- cgit v1.2.3 From 66aea9913cf435fe92ebb7bf869b4f15901ab993 Mon Sep 17 00:00:00 2001 From: Dmitriy Zavin Date: Tue, 26 Sep 2006 10:52:42 +0200 Subject: [PATCH] i386: Make the jiffies compares use the 64bit safe macros. 
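For the why, a sketch of the pattern being converted to: 32-bit jiffies wraps (deliberately soon after boot, via INITIAL_JIFFIES), so a deadline parked five minutes in the future in an unsigned long can straddle that wrap, while the 64-bit counter never wraps in practice.

__u64 now = get_jiffies_64();

if (time_before64(now, next_check))     /* next_check is a __u64 now */
        return 0;
next_check = now + 300 * HZ;            /* re-arm the 5 minute rate limit */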
Signed-off-by: Dmitriy Zavin Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/mcheck/therm_throt.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c index 85eba00d680..101f7ace00c 100644 --- a/arch/i386/kernel/cpu/mcheck/therm_throt.c +++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c @@ -18,7 +18,7 @@ /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) -static DEFINE_PER_CPU(unsigned long, next_check); +static DEFINE_PER_CPU(__u64, next_check); /*** * therm_throt_process - Process thermal throttling event @@ -39,11 +39,12 @@ static DEFINE_PER_CPU(unsigned long, next_check); int therm_throt_process(int curr) { unsigned int cpu = smp_processor_id(); + __u64 tmp_jiffs = get_jiffies_64(); - if (time_before(jiffies, __get_cpu_var(next_check))) + if (time_before64(tmp_jiffs, __get_cpu_var(next_check))) return 0; - __get_cpu_var(next_check) = jiffies + CHECK_INTERVAL; + __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; /* if we just entered the thermal event */ if (curr) { -- cgit v1.2.3 From 3222b36f46c22f46697a0a53fa8804153a32669f Mon Sep 17 00:00:00 2001 From: Dmitriy Zavin Date: Tue, 26 Sep 2006 10:52:42 +0200 Subject: [PATCH] x86: Add a cumulative thermal throttle event counter. The counter is exported to /sys that keeps track of the number of thermal events, such that the user knows how bad the thermal problem might be (since the logging to syslog and mcelog is rate limited). AK: Fixed cpu hotplug locking Signed-off-by: Dmitriy Zavin Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/mcheck/p4.c | 3 + arch/i386/kernel/cpu/mcheck/therm_throt.c | 133 ++++++++++++++++++++++++++++-- 2 files changed, 130 insertions(+), 6 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c index d83a669d376..504434a4601 100644 --- a/arch/i386/kernel/cpu/mcheck/p4.c +++ b/arch/i386/kernel/cpu/mcheck/p4.c @@ -115,6 +115,9 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) l = apic_read (APIC_LVTTHMR); apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED); printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); + + /* enable thermal throttle processing */ + atomic_set(&therm_throt_en, 1); return; } #endif /* CONFIG_X86_MCE_P4THERMAL */ diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c index 101f7ace00c..4f43047de40 100644 --- a/arch/i386/kernel/cpu/mcheck/therm_throt.c +++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c @@ -1,15 +1,22 @@ /* * linux/arch/i386/kerne/cpu/mcheck/therm_throt.c * - * Thermal throttle event support code. + * Thermal throttle event support code (such as syslog messaging and rate + * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). + * This allows consistent reporting of CPU thermal throttle events. + * + * Maintains a counter in /sys that keeps track of the number of thermal + * events, such that the user knows how bad the thermal problem might be + * (since the logging to syslog and mcelog is rate limited). * * Author: Dmitriy Zavin (dmitriyz@google.com) * * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. - * + * Inspired by Ross Biro's and Al Borchers' counter code. 
*/ #include +#include #include #include #include @@ -18,15 +25,53 @@ /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) -static DEFINE_PER_CPU(__u64, next_check); +static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; +static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); +atomic_t therm_throt_en = ATOMIC_INIT(0); + +#ifdef CONFIG_SYSFS +#define define_therm_throt_sysdev_one_ro(_name) \ + static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) + +#define define_therm_throt_sysdev_show_func(name) \ +static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ + char *buf) \ +{ \ + unsigned int cpu = dev->id; \ + ssize_t ret; \ + \ + preempt_disable(); /* CPU hotplug */ \ + if (cpu_online(cpu)) \ + ret = sprintf(buf, "%lu\n", \ + per_cpu(thermal_throttle_##name, cpu)); \ + else \ + ret = 0; \ + preempt_enable(); \ + \ + return ret; \ +} + +define_therm_throt_sysdev_show_func(count); +define_therm_throt_sysdev_one_ro(count); + +static struct attribute *thermal_throttle_attrs[] = { + &attr_count.attr, + NULL +}; + +static struct attribute_group thermal_throttle_attr_group = { + .attrs = thermal_throttle_attrs, + .name = "thermal_throttle" +}; +#endif /* CONFIG_SYSFS */ /*** - * therm_throt_process - Process thermal throttling event + * therm_throt_process - Process thermal throttling event from interrupt * @curr: Whether the condition is current or not (boolean), since the * thermal interrupt normally gets called both when the thermal * event begins and once the event has ended. * - * This function is normally called by the thermal interrupt after the + * This function is called by the thermal interrupt after the * IRQ has been acknowledged. * * It will take care of rate limiting and printing messages to the syslog. @@ -41,6 +86,9 @@ int therm_throt_process(int curr) unsigned int cpu = smp_processor_id(); __u64 tmp_jiffs = get_jiffies_64(); + if (curr) + __get_cpu_var(thermal_throttle_count)++; + if (time_before64(tmp_jiffs, __get_cpu_var(next_check))) return 0; @@ -49,7 +97,9 @@ int therm_throt_process(int curr) /* if we just entered the thermal event */ if (curr) { printk(KERN_CRIT "CPU%d: Temperature above threshold, " - "cpu clock throttled\n", cpu); + "cpu clock throttled (total events = %lu)\n", cpu, + __get_cpu_var(thermal_throttle_count)); + add_taint(TAINT_MACHINE_CHECK); } else { printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu); @@ -57,3 +107,74 @@ int therm_throt_process(int curr) return 1; } + +#ifdef CONFIG_SYSFS +/* Add/Remove thermal_throttle interface for CPU device */ +static __cpuinit int thermal_throttle_add_dev(struct sys_device * sys_dev) +{ + sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +static __cpuinit int thermal_throttle_remove_dev(struct sys_device * sys_dev) +{ + sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); + return 0; +} + +/* Mutex protecting device creation against CPU hotplug */ +static DEFINE_MUTEX(therm_cpu_lock); + +/* Get notified when a cpu comes on/off. Be hotplug friendly. 
*/ +static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct sys_device *sys_dev; + + sys_dev = get_cpu_sysdev(cpu); + mutex_lock(&therm_cpu_lock); + switch (action) { + case CPU_ONLINE: + thermal_throttle_add_dev(sys_dev); + break; + case CPU_DEAD: + thermal_throttle_remove_dev(sys_dev); + break; + } + mutex_unlock(&therm_cpu_lock); + return NOTIFY_OK; +} + +static struct notifier_block thermal_throttle_cpu_notifier = +{ + .notifier_call = thermal_throttle_cpu_callback, +}; +#endif /* CONFIG_HOTPLUG_CPU */ + +static __init int thermal_throttle_init_device(void) +{ + unsigned int cpu = 0; + + if (!atomic_read(&therm_throt_en)) + return 0; + + register_hotcpu_notifier(&thermal_throttle_cpu_notifier); + +#ifdef CONFIG_HOTPLUG_CPU + mutex_lock(&therm_cpu_lock); +#endif + /* connect live CPUs to sysfs */ + for_each_online_cpu(cpu) + thermal_throttle_add_dev(get_cpu_sysdev(cpu)); +#ifdef CONFIG_HOTPLUG_CPU + mutex_unlock(&therm_cpu_lock); +#endif + + return 0; +} + +device_initcall(thermal_throttle_init_device); +#endif /* CONFIG_SYSFS */ -- cgit v1.2.3 From dcf10307c3fff2bca4b104ad8fe4e3aa8dcb2ad1 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 26 Sep 2006 10:52:42 +0200 Subject: [PATCH] i386/x86-64: New Intel feature flags Add supplemental SSE3 instructions flag, and Direct Cache Access flag. As described in "Intel Processor idenfication and the CPUID instruction AP485 Sept 2006" AK: also added for x86-64 Signed-off-by: Dave Jones Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c index f54a15268ed..76aac088a32 100644 --- a/arch/i386/kernel/cpu/proc.c +++ b/arch/i386/kernel/cpu/proc.c @@ -46,8 +46,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* Intel-defined (#2) */ "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", - "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, + NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* VIA/Cyrix/Centaur-defined */ -- cgit v1.2.3
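For reference, the two strings added above sit at bits 9 (ssse3) and 18 (dca) of the ECX word returned by CPUID leaf 1. A standalone userspace check of the same bits (illustrative, not kernel code; may need -fno-pic on i386 since CPUID clobbers %ebx):

#include <stdio.h>

int main(void)
{
        unsigned a = 1, b, c, d;

        __asm__("cpuid" : "+a" (a), "=b" (b), "=c" (c), "=d" (d));
        printf("ssse3=%u dca=%u\n", (c >> 9) & 1, (c >> 18) & 1);
        return 0;
}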