From b07f8915cda3fcd73b8b68075ba1e6cd0673365d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:26 +0200 Subject: [PATCH] x86: Temporarily revert parts of the Core 2 nmi nmi watchdog support This makes merging easier. They are readded a few patches later. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/nmi.c | 81 +++--------------------------------------------- 1 file changed, 5 insertions(+), 76 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 5baa0c726e9..42c05d6907b 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -26,7 +26,6 @@ #include #include #include -#include /* * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: @@ -66,9 +65,6 @@ static unsigned int nmi_p4_cccr_val; #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 #define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING -#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL -#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK - #define MSR_P4_MISC_ENABLE 0x1A0 #define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) #define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12) @@ -100,10 +96,7 @@ static __cpuinit inline int nmi_known_cpu(void) case X86_VENDOR_AMD: return boot_cpu_data.x86 == 15; case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return 1; - else - return (boot_cpu_data.x86 == 15); + return boot_cpu_data.x86 == 15; } return 0; } @@ -209,8 +202,6 @@ int __init setup_nmi_watchdog(char *str) __setup("nmi_watchdog=", setup_nmi_watchdog); -static void disable_intel_arch_watchdog(void); - static void disable_lapic_nmi_watchdog(void) { if (nmi_active <= 0) @@ -223,8 +214,6 @@ static void disable_lapic_nmi_watchdog(void) if (boot_cpu_data.x86 == 15) { wrmsr(MSR_P4_IQ_CCCR0, 0, 0); wrmsr(MSR_P4_CRU_ESCR0, 0, 0); - } else if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - disable_intel_arch_watchdog(); } break; } @@ -377,53 +366,6 @@ static void setup_k7_watchdog(void) wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); } -static void disable_intel_arch_watchdog(void) -{ - unsigned ebx; - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebp indicates event present. - */ - ebx = cpuid_ebx(10); - if (!(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, 0, 0); -} - -static int setup_intel_arch_watchdog(void) -{ - unsigned int evntsel; - unsigned ebx; - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebp indicates event present. - */ - ebx = cpuid_ebx(10); - if ((ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - return 0; - - nmi_perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; - - clear_msr_range(MSR_ARCH_PERFMON_EVENTSEL0, 2); - clear_msr_range(MSR_ARCH_PERFMON_PERFCTR0, 2); - - evntsel = ARCH_PERFMON_EVENTSEL_INT - | ARCH_PERFMON_EVENTSEL_OS - | ARCH_PERFMON_EVENTSEL_USR - | ARCH_PERFMON_NMI_EVENT_SEL - | ARCH_PERFMON_NMI_EVENT_UMASK; - - wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); - wrmsrl(MSR_ARCH_PERFMON_PERFCTR0, -((u64)cpu_khz * 1000 / nmi_hz)); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); - return 1; -} - static int setup_p4_watchdog(void) { @@ -477,16 +419,10 @@ void setup_apic_nmi_watchdog(void) setup_k7_watchdog(); break; case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - if (!setup_intel_arch_watchdog()) - return; - } else if (boot_cpu_data.x86 == 15) { - if (!setup_p4_watchdog()) - return; - } else { + if (boot_cpu_data.x86 != 15) + return; + if (!setup_p4_watchdog()) return; - } - break; default: @@ -571,14 +507,7 @@ void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) */ wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); apic_write(APIC_LVTPC, APIC_DM_NMI); - } else if (nmi_perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { - /* - * For Intel based architectural perfmon - * - LVTPC is masked on interrupt and must be - * unmasked by the LVTPC handler. - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - } + } wrmsrl(nmi_perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); } } -- cgit v1.2.3 From 828f0afda123a96ff4e8078f057a302f4b4232ae Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 26 Sep 2006 10:52:26 +0200 Subject: [PATCH] x86: Add performance counter reservation framework for UP kernels Adds basic infrastructure to allow subsystems to reserve performance counters on the x86 chips. Only UP kernels are supported in this patch to make reviewing easier. The SMP portion makes a lot more changes. Think of this as a locking mechanism where each bit represents a different counter. In addition, each subsystem should also reserve an appropriate event selection register that will correspond to the performance counter it will be using (this is mainly neccessary for the Pentium 4 chips as they break the 1:1 relationship to performance counters). This will help prevent subsystems like oprofile from interfering with the nmi watchdog. Signed-off-by: Don Zickus Signed-off-by: Andi Kleen --- arch/x86_64/kernel/nmi.c | 178 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 138 insertions(+), 40 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 42c05d6907b..b7a7c997384 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -27,6 +27,20 @@ #include #include +/* perfctr_nmi_owner tracks the ownership of the perfctr registers: + * evtsel_nmi_owner tracks the ownership of the event selection + * - different performance counters/ event selection may be reserved for + * different subsystems this reservation system just tries to coordinate + * things a little + */ +static DEFINE_PER_CPU(unsigned, perfctr_nmi_owner); +static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[2]); + +/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's + * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) + */ +#define NMI_MAX_COUNTER_BITS 66 + /* * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: * - it may be reserved by some other driver, or not @@ -90,6 +104,95 @@ static unsigned int nmi_p4_cccr_val; (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) +/* converts an msr to an appropriate reservation bit */ +static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) +{ + /* returns the bit offset of the performance counter register */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + return (msr - MSR_K7_PERFCTR0); + case X86_VENDOR_INTEL: + return (msr - MSR_P4_BPU_PERFCTR0); + } + return 0; +} + +/* converts an msr to an appropriate reservation bit */ +static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) +{ + /* returns the bit offset of the event selection register */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + return (msr - MSR_K7_EVNTSEL0); + case X86_VENDOR_INTEL: + return (msr - MSR_P4_BSU_ESCR0); + } + return 0; +} + +/* checks for a bit availability (hack for oprofile) */ +int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) +{ + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner))); +} + +/* checks the an msr for availability */ +int avail_to_resrv_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner))); +} + +int reserve_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner))) + return 1; + return 0; +} + +void release_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner)); +} + +int reserve_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner))) + return 1; + return 0; +} + +void release_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)); +} + static __cpuinit inline int nmi_known_cpu(void) { switch (boot_cpu_data.x86_vendor) { @@ -325,34 +428,22 @@ late_initcall(init_lapic_nmi_sysfs); #endif /* CONFIG_PM */ -/* - * Activate the NMI watchdog via the local APIC. - * Original code written by Keith Owens. - */ - -static void clear_msr_range(unsigned int base, unsigned int n) +static int setup_k7_watchdog(void) { - unsigned int i; - - for(i = 0; i < n; ++i) - wrmsr(base+i, 0, 0); -} - -static void setup_k7_watchdog(void) -{ - int i; unsigned int evntsel; nmi_perfctr_msr = MSR_K7_PERFCTR0; - for(i = 0; i < 4; ++i) { - /* Simulator may not support it */ - if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) { - nmi_perfctr_msr = 0; - return; - } - wrmsrl(MSR_K7_PERFCTR0+i, 0UL); - } + if (!reserve_perfctr_nmi(nmi_perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0)) + goto fail1; + + /* Simulator may not support it */ + if (checking_wrmsrl(MSR_K7_EVNTSEL0, 0UL)) + goto fail2; + wrmsrl(MSR_K7_PERFCTR0, 0UL); evntsel = K7_EVNTSEL_INT | K7_EVNTSEL_OS @@ -364,6 +455,13 @@ static void setup_k7_watchdog(void) apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= K7_EVNTSEL_ENABLE; wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); + return 1; +fail2: + release_evntsel_nmi(MSR_K7_EVNTSEL0); +fail1: + release_perfctr_nmi(nmi_perfctr_msr); +fail: + return 0; } @@ -382,22 +480,11 @@ static int setup_p4_watchdog(void) nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1; #endif - if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL)) - clear_msr_range(0x3F1, 2); - /* MSR 0x3F0 seems to have a default value of 0xFC00, but current - docs doesn't fully define it, so leave it alone for now. */ - if (boot_cpu_data.x86_model >= 0x3) { - /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */ - clear_msr_range(0x3A0, 26); - clear_msr_range(0x3BC, 3); - } else { - clear_msr_range(0x3A0, 31); - } - clear_msr_range(0x3C0, 6); - clear_msr_range(0x3C8, 6); - clear_msr_range(0x3E0, 2); - clear_msr_range(MSR_P4_CCCR0, 18); - clear_msr_range(MSR_P4_PERFCTR0, 18); + if (!reserve_perfctr_nmi(nmi_perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0)) + goto fail1; wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); @@ -406,6 +493,10 @@ static int setup_p4_watchdog(void) apic_write(APIC_LVTPC, APIC_DM_NMI); wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); return 1; +fail1: + release_perfctr_nmi(nmi_perfctr_msr); +fail: + return 0; } void setup_apic_nmi_watchdog(void) @@ -416,7 +507,8 @@ void setup_apic_nmi_watchdog(void) return; if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) return; - setup_k7_watchdog(); + if (!setup_k7_watchdog()) + return; break; case X86_VENDOR_INTEL: if (boot_cpu_data.x86 != 15) @@ -588,6 +680,12 @@ int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); +EXPORT_SYMBOL(reserve_perfctr_nmi); +EXPORT_SYMBOL(release_perfctr_nmi); +EXPORT_SYMBOL(reserve_evntsel_nmi); +EXPORT_SYMBOL(release_evntsel_nmi); EXPORT_SYMBOL(reserve_lapic_nmi); EXPORT_SYMBOL(release_lapic_nmi); EXPORT_SYMBOL(disable_timer_nmi_watchdog); -- cgit v1.2.3 From f2802e7f571c05f9a901b1f5bd144aa730ccc88e Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 26 Sep 2006 10:52:26 +0200 Subject: [PATCH] Add SMP support on x86_64 to reservation framework This patch includes the changes to make the nmi watchdog on x86_64 SMP aware. A bunch of code was moved around to make it simpler to read. In addition, it is now possible to determine if a particular NMI was the result of the watchdog or not. This feature allows the kernel to filter out unknown NMIs easier. Signed-off-by: Don Zickus Signed-off-by: Andi Kleen --- arch/x86_64/kernel/apic.c | 3 +- arch/x86_64/kernel/nmi.c | 426 +++++++++++++++++++++++++++++++--------------- 2 files changed, 288 insertions(+), 141 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 2b8cef037a6..692e9b57974 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -479,8 +479,7 @@ void __cpuinit setup_local_APIC (void) } nmi_watchdog_default(); - if (nmi_watchdog == NMI_LOCAL_APIC) - setup_apic_nmi_watchdog(); + setup_apic_nmi_watchdog(NULL); apic_pm_activate(); } diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index b7a7c997384..d42374a952d 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -56,53 +56,29 @@ static unsigned int lapic_nmi_owner; #define LAPIC_NMI_RESERVED (1<<1) /* nmi_active: - * +1: the lapic NMI watchdog is active, but can be disabled - * 0: the lapic NMI watchdog has not been set up, and cannot + * >0: the lapic NMI watchdog is active, but can be disabled + * <0: the lapic NMI watchdog has not been set up, and cannot * be enabled - * -1: the lapic NMI watchdog is disabled, but can be enabled + * 0: the lapic NMI watchdog is disabled, but can be enabled */ -int nmi_active; /* oprofile uses this */ +atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ int panic_on_timeout; unsigned int nmi_watchdog = NMI_DEFAULT; static unsigned int nmi_hz = HZ; -static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ -static unsigned int nmi_p4_cccr_val; -/* Note that these events don't tick when the CPU idles. This means - the frequency varies with CPU load. */ - -#define K7_EVNTSEL_ENABLE (1 << 22) -#define K7_EVNTSEL_INT (1 << 20) -#define K7_EVNTSEL_OS (1 << 17) -#define K7_EVNTSEL_USR (1 << 16) -#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 -#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING +struct nmi_watchdog_ctlblk { + int enabled; + u64 check_bit; + unsigned int cccr_msr; + unsigned int perfctr_msr; /* the MSR to reset in NMI handler */ + unsigned int evntsel_msr; /* the MSR to select the events to handle */ +}; +static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); -#define MSR_P4_MISC_ENABLE 0x1A0 -#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) -#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12) -#define MSR_P4_PERFCTR0 0x300 -#define MSR_P4_CCCR0 0x360 -#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) -#define P4_ESCR_OS (1<<3) -#define P4_ESCR_USR (1<<2) -#define P4_CCCR_OVF_PMI0 (1<<26) -#define P4_CCCR_OVF_PMI1 (1<<27) -#define P4_CCCR_THRESHOLD(N) ((N)<<20) -#define P4_CCCR_COMPLEMENT (1<<19) -#define P4_CCCR_COMPARE (1<<18) -#define P4_CCCR_REQUIRED (3<<16) -#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) -#define P4_CCCR_ENABLE (1<<12) -/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter - CRU_ESCR0 (with any non-null event selector) through a complemented - max threshold. [IA32-Vol3, Section 14.9.9] */ -#define MSR_P4_IQ_COUNTER0 0x30C -#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR) -#define P4_NMI_IQ_CCCR0 \ - (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ - P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) +/* local prototypes */ +static void stop_apic_nmi_watchdog(void *unused); +static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); /* converts an msr to an appropriate reservation bit */ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) @@ -241,6 +217,12 @@ int __init check_nmi_watchdog (void) int *counts; int cpu; + if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT)) + return 0; + + if (!atomic_read(&nmi_active)) + return 0; + counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); if (!counts) return -1; @@ -258,19 +240,22 @@ int __init check_nmi_watchdog (void) mdelay((10*1000)/nmi_hz); // wait 10 ticks for_each_online_cpu(cpu) { + if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled) + continue; if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) { - endflag = 1; printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", cpu, counts[cpu], cpu_pda(cpu)->__nmi_count); - nmi_active = 0; - lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; - nmi_perfctr_msr = 0; - kfree(counts); - return -1; + per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0; + atomic_dec(&nmi_active); } } + if (!atomic_read(&nmi_active)) { + kfree(counts); + atomic_set(&nmi_active, -1); + return -1; + } endflag = 1; printk("OK.\n"); @@ -297,8 +282,11 @@ int __init setup_nmi_watchdog(char *str) get_option(&str, &nmi); - if (nmi >= NMI_INVALID) + if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) return 0; + + if ((nmi == NMI_LOCAL_APIC) && (nmi_known_cpu() == 0)) + return 0; /* no lapic support */ nmi_watchdog = nmi; return 1; } @@ -307,31 +295,30 @@ __setup("nmi_watchdog=", setup_nmi_watchdog); static void disable_lapic_nmi_watchdog(void) { - if (nmi_active <= 0) + BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); + + if (atomic_read(&nmi_active) <= 0) return; - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - wrmsr(MSR_K7_EVNTSEL0, 0, 0); - break; - case X86_VENDOR_INTEL: - if (boot_cpu_data.x86 == 15) { - wrmsr(MSR_P4_IQ_CCCR0, 0, 0); - wrmsr(MSR_P4_CRU_ESCR0, 0, 0); - } - break; - } - nmi_active = -1; - /* tell do_nmi() and others that we're not active any more */ - nmi_watchdog = 0; + + on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); + + BUG_ON(atomic_read(&nmi_active) != 0); } static void enable_lapic_nmi_watchdog(void) { - if (nmi_active < 0) { - nmi_watchdog = NMI_LOCAL_APIC; - touch_nmi_watchdog(); - setup_apic_nmi_watchdog(); - } + BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); + + /* are we already enabled */ + if (atomic_read(&nmi_active) != 0) + return; + + /* are we lapic aware */ + if (nmi_known_cpu() <= 0) + return; + + on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); + touch_nmi_watchdog(); } int reserve_lapic_nmi(void) @@ -363,21 +350,24 @@ void release_lapic_nmi(void) void disable_timer_nmi_watchdog(void) { - if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0)) + BUG_ON(nmi_watchdog != NMI_IO_APIC); + + if (atomic_read(&nmi_active) <= 0) return; disable_irq(0); - unset_nmi_callback(); - nmi_active = -1; - nmi_watchdog = NMI_NONE; + on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); + + BUG_ON(atomic_read(&nmi_active) != 0); } void enable_timer_nmi_watchdog(void) { - if (nmi_active < 0) { - nmi_watchdog = NMI_IO_APIC; + BUG_ON(nmi_watchdog != NMI_IO_APIC); + + if (atomic_read(&nmi_active) == 0) { touch_nmi_watchdog(); - nmi_active = 1; + on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); enable_irq(0); } } @@ -388,7 +378,7 @@ static int nmi_pm_active; /* nmi_active before suspend */ static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) { - nmi_pm_active = nmi_active; + nmi_pm_active = atomic_read(&nmi_active); disable_lapic_nmi_watchdog(); return 0; } @@ -396,7 +386,7 @@ static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) static int lapic_nmi_resume(struct sys_device *dev) { if (nmi_pm_active > 0) - enable_lapic_nmi_watchdog(); + enable_lapic_nmi_watchdog(); return 0; } @@ -415,7 +405,13 @@ static int __init init_lapic_nmi_sysfs(void) { int error; - if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC) + /* should really be a BUG_ON but b/c this is an + * init call, it just doesn't work. -dcz + */ + if (nmi_watchdog != NMI_LOCAL_APIC) + return 0; + + if ( atomic_read(&nmi_active) < 0 ) return 0; error = sysdev_class_register(&nmi_sysclass); @@ -428,100 +424,232 @@ late_initcall(init_lapic_nmi_sysfs); #endif /* CONFIG_PM */ +/* + * Activate the NMI watchdog via the local APIC. + * Original code written by Keith Owens. + */ + +/* Note that these events don't tick when the CPU idles. This means + the frequency varies with CPU load. */ + +#define K7_EVNTSEL_ENABLE (1 << 22) +#define K7_EVNTSEL_INT (1 << 20) +#define K7_EVNTSEL_OS (1 << 17) +#define K7_EVNTSEL_USR (1 << 16) +#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 +#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING + static int setup_k7_watchdog(void) { + unsigned int perfctr_msr, evntsel_msr; unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - nmi_perfctr_msr = MSR_K7_PERFCTR0; - - if (!reserve_perfctr_nmi(nmi_perfctr_msr)) + perfctr_msr = MSR_K7_PERFCTR0; + evntsel_msr = MSR_K7_EVNTSEL0; + if (!reserve_perfctr_nmi(perfctr_msr)) goto fail; - if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0)) + if (!reserve_evntsel_nmi(evntsel_msr)) goto fail1; /* Simulator may not support it */ - if (checking_wrmsrl(MSR_K7_EVNTSEL0, 0UL)) + if (checking_wrmsrl(evntsel_msr, 0UL)) goto fail2; - wrmsrl(MSR_K7_PERFCTR0, 0UL); + wrmsrl(perfctr_msr, 0UL); evntsel = K7_EVNTSEL_INT | K7_EVNTSEL_OS | K7_EVNTSEL_USR | K7_NMI_EVENT; - wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); - wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz * 1000 / nmi_hz)); + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= K7_EVNTSEL_ENABLE; - wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + wd->check_bit = 1ULL<<63; return 1; fail2: - release_evntsel_nmi(MSR_K7_EVNTSEL0); + release_evntsel_nmi(evntsel_msr); fail1: - release_perfctr_nmi(nmi_perfctr_msr); + release_perfctr_nmi(perfctr_msr); fail: return 0; } +static void stop_k7_watchdog(void) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + wrmsr(wd->evntsel_msr, 0, 0); + + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + +/* Note that these events don't tick when the CPU idles. This means + the frequency varies with CPU load. */ + +#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) +#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) +#define P4_ESCR_OS (1<<3) +#define P4_ESCR_USR (1<<2) +#define P4_CCCR_OVF_PMI0 (1<<26) +#define P4_CCCR_OVF_PMI1 (1<<27) +#define P4_CCCR_THRESHOLD(N) ((N)<<20) +#define P4_CCCR_COMPLEMENT (1<<19) +#define P4_CCCR_COMPARE (1<<18) +#define P4_CCCR_REQUIRED (3<<16) +#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) +#define P4_CCCR_ENABLE (1<<12) +#define P4_CCCR_OVF (1<<31) +/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter + CRU_ESCR0 (with any non-null event selector) through a complemented + max threshold. [IA32-Vol3, Section 14.9.9] */ static int setup_p4_watchdog(void) { + unsigned int perfctr_msr, evntsel_msr, cccr_msr; + unsigned int evntsel, cccr_val; unsigned int misc_enable, dummy; + unsigned int ht_num; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy); + rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy); if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) return 0; - nmi_perfctr_msr = MSR_P4_IQ_COUNTER0; - nmi_p4_cccr_val = P4_NMI_IQ_CCCR0; #ifdef CONFIG_SMP - if (smp_num_siblings == 2) - nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1; + /* detect which hyperthread we are on */ + if (smp_num_siblings == 2) { + unsigned int ebx, apicid; + + ebx = cpuid_ebx(1); + apicid = (ebx >> 24) & 0xff; + ht_num = apicid & 1; + } else #endif + ht_num = 0; + + /* performance counters are shared resources + * assign each hyperthread its own set + * (re-use the ESCR0 register, seems safe + * and keeps the cccr_val the same) + */ + if (!ht_num) { + /* logical cpu 0 */ + perfctr_msr = MSR_P4_IQ_PERFCTR0; + evntsel_msr = MSR_P4_CRU_ESCR0; + cccr_msr = MSR_P4_IQ_CCCR0; + cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); + } else { + /* logical cpu 1 */ + perfctr_msr = MSR_P4_IQ_PERFCTR1; + evntsel_msr = MSR_P4_CRU_ESCR0; + cccr_msr = MSR_P4_IQ_CCCR1; + cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); + } - if (!reserve_perfctr_nmi(nmi_perfctr_msr)) + if (!reserve_perfctr_nmi(perfctr_msr)) goto fail; - if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0)) + if (!reserve_evntsel_nmi(evntsel_msr)) goto fail1; - wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); - wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); - Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz * 1000UL / nmi_hz)); - wrmsrl(MSR_P4_IQ_COUNTER0, -((u64)cpu_khz * 1000 / nmi_hz)); + evntsel = P4_ESCR_EVENT_SELECT(0x3F) + | P4_ESCR_OS + | P4_ESCR_USR; + + cccr_val |= P4_CCCR_THRESHOLD(15) + | P4_CCCR_COMPLEMENT + | P4_CCCR_COMPARE + | P4_CCCR_REQUIRED; + + wrmsr(evntsel_msr, evntsel, 0); + wrmsr(cccr_msr, cccr_val, 0); + wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); apic_write(APIC_LVTPC, APIC_DM_NMI); - wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); + cccr_val |= P4_CCCR_ENABLE; + wrmsr(cccr_msr, cccr_val, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = cccr_msr; + wd->check_bit = 1ULL<<39; return 1; fail1: - release_perfctr_nmi(nmi_perfctr_msr); + release_perfctr_nmi(perfctr_msr); fail: return 0; } -void setup_apic_nmi_watchdog(void) +static void stop_p4_watchdog(void) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (boot_cpu_data.x86 != 15) - return; - if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) - return; - if (!setup_k7_watchdog()) - return; - break; - case X86_VENDOR_INTEL: - if (boot_cpu_data.x86 != 15) - return; - if (!setup_p4_watchdog()) + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + wrmsr(wd->cccr_msr, 0, 0); + wrmsr(wd->evntsel_msr, 0, 0); + + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + +void setup_apic_nmi_watchdog(void *unused) +{ + /* only support LOCAL and IO APICs for now */ + if ((nmi_watchdog != NMI_LOCAL_APIC) && + (nmi_watchdog != NMI_IO_APIC)) + return; + + if (nmi_watchdog == NMI_LOCAL_APIC) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) + return; + if (!setup_k7_watchdog()) + return; + break; + case X86_VENDOR_INTEL: + if (!setup_p4_watchdog()) + return; + break; + default: return; - break; + } + } + __get_cpu_var(nmi_watchdog_ctlblk.enabled) = 1; + atomic_inc(&nmi_active); +} - default: - return; +static void stop_apic_nmi_watchdog(void *unused) +{ + /* only support LOCAL and IO APICs for now */ + if ((nmi_watchdog != NMI_LOCAL_APIC) && + (nmi_watchdog != NMI_IO_APIC)) + return; + + if (nmi_watchdog == NMI_LOCAL_APIC) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) + return; + stop_k7_watchdog(); + break; + case X86_VENDOR_INTEL: + stop_p4_watchdog(); + break; + default: + return; + } } - lapic_nmi_owner = LAPIC_NMI_WATCHDOG; - nmi_active = 1; + __get_cpu_var(nmi_watchdog_ctlblk.enabled) = 0; + atomic_dec(&nmi_active); } /* @@ -558,50 +686,70 @@ void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { int sum; int touched = 0; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + u64 dummy; + + /* check for other users first */ + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) { + touched = 1; + } sum = read_pda(apic_timer_irqs); if (__get_cpu_var(nmi_touch)) { __get_cpu_var(nmi_touch) = 0; touched = 1; } + #ifdef CONFIG_X86_MCE /* Could check oops_in_progress here too, but it's safer not too */ if (atomic_read(&mce_entry) > 0) touched = 1; #endif + /* if the apic timer isn't firing, this cpu isn't doing much */ if (!touched && __get_cpu_var(last_irq_sum) == sum) { /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) { - local_set(&__get_cpu_var(alert_counter), 0); - return; - } + if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs); - } } else { __get_cpu_var(last_irq_sum) = sum; local_set(&__get_cpu_var(alert_counter), 0); } - if (nmi_perfctr_msr) { - if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) { - /* - * P4 quirks: - * - An overflown perfctr will assert its interrupt - * until the OVF flag in its CCCR is cleared. - * - LVTPC is masked on interrupt and must be - * unmasked by the LVTPC handler. - */ - wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); - apic_write(APIC_LVTPC, APIC_DM_NMI); - } - wrmsrl(nmi_perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); + + /* see if the nmi watchdog went off */ + if (wd->enabled) { + if (nmi_watchdog == NMI_LOCAL_APIC) { + rdmsrl(wd->perfctr_msr, dummy); + if (dummy & wd->check_bit){ + /* this wasn't a watchdog timer interrupt */ + goto done; + } + + /* only Intel uses the cccr msr */ + if (wd->cccr_msr != 0) { + /* + * P4 quirks: + * - An overflown perfctr will assert its interrupt + * until the OVF flag in its CCCR is cleared. + * - LVTPC is masked on interrupt and must be + * unmasked by the LVTPC handler. + */ + rdmsrl(wd->cccr_msr, dummy); + dummy &= ~P4_CCCR_OVF; + wrmsrl(wd->cccr_msr, dummy); + apic_write(APIC_LVTPC, APIC_DM_NMI); + } + /* start the cycle over again */ + wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); + } } +done: + return; } static __kprobes int dummy_nmi_callback(struct pt_regs * regs, int cpu) -- cgit v1.2.3 From 3adbbcce9a49b900d4cc118cdccfdefa78bf1afb Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 26 Sep 2006 10:52:26 +0200 Subject: [PATCH] x86: Cleanup NMI interrupt path This patch cleans up the NMI interrupt path. Instead of being gated by if the 'nmi callback' is set, the interrupt handler now calls everyone who is registered on the die_chain and additionally checks the nmi watchdog, reseting it if enabled. This allows more subsystems to hook into the NMI if they need to (without being block by set_nmi_callback). Signed-off-by: Don Zickus Signed-off-by: Andi Kleen --- arch/x86_64/kernel/nmi.c | 26 +++++++++++++++++++------- arch/x86_64/kernel/traps.c | 8 ++++---- 2 files changed, 23 insertions(+), 11 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index d42374a952d..f6b881b23a7 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -682,16 +682,18 @@ void touch_nmi_watchdog (void) touch_softlockup_watchdog(); } -void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) +int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { int sum; int touched = 0; struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); u64 dummy; + int rc=0; /* check for other users first */ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) { + rc = 1; touched = 1; } @@ -746,10 +748,18 @@ void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) } /* start the cycle over again */ wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); - } + rc = 1; + } else if (nmi_watchdog == NMI_IO_APIC) { + /* don't know how to accurately check for this. + * just assume it was a watchdog timer interrupt + * This matches the old behaviour. + */ + rc = 1; + } else + printk(KERN_WARNING "Unknown enabled NMI hardware?!\n"); } done: - return; + return rc; } static __kprobes int dummy_nmi_callback(struct pt_regs * regs, int cpu) @@ -761,15 +771,17 @@ static nmi_callback_t nmi_callback = dummy_nmi_callback; asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) { - int cpu = safe_smp_processor_id(); - nmi_enter(); add_pda(__nmi_count,1); - if (!rcu_dereference(nmi_callback)(regs, cpu)) - default_do_nmi(regs); + default_do_nmi(regs); nmi_exit(); } +int do_nmi_callback(struct pt_regs * regs, int cpu) +{ + return rcu_dereference(nmi_callback)(regs, cpu); +} + void set_nmi_callback(nmi_callback_t callback) { vmalloc_sync_all(); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index b1249774d1e..42bc070fdf1 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -781,12 +781,12 @@ asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs) * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. */ - if (nmi_watchdog > 0) { - nmi_watchdog_tick(regs,reason); + if (nmi_watchdog_tick(regs,reason)) return; - } + if (!do_nmi_callback(regs,cpu)) #endif - unknown_nmi_error(reason, regs); + unknown_nmi_error(reason, regs); + return; } if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) -- cgit v1.2.3 From 1d001df19d5323e642ba8ac821c713675ebccd82 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:26 +0200 Subject: [PATCH] Add TIF_RESTORE_SIGMASK We need TIF_RESTORE_SIGMASK in order to support ppoll() and pselect() system calls. This patch originally came from Andi, and was based heavily on David Howells' implementation of same on i386. I fixed a typo which was causing do_signal() to use the wrong signal mask. Signed-off-by: David Woodhouse Signed-off-by: Andi Kleen --- arch/x86_64/kernel/signal.c | 82 +++++++++++++++++++-------------------------- 1 file changed, 35 insertions(+), 47 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 28161170fb0..7f58bc9a056 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -37,37 +37,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, int ia32_setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, struct pt_regs * regs); -asmlinkage long -sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs) -{ - sigset_t saveset, newset; - - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&newset, unewset, sizeof(newset))) - return -EFAULT; - sigdelsetmask(&newset, ~_BLOCKABLE); - - spin_lock_irq(¤t->sighand->siglock); - saveset = current->blocked; - current->blocked = newset; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); -#ifdef DEBUG_SIG - printk("rt_sigsuspend savset(%lx) newset(%lx) regs(%p) rip(%lx)\n", - saveset, newset, regs, regs->rip); -#endif - regs->rax = -EINTR; - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (do_signal(regs, &saveset)) - return -EINTR; - } -} - asmlinkage long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) @@ -341,11 +310,11 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return 1; + return 0; give_sigsegv: force_sigsegv(sig, current); - return 0; + return -EFAULT; } /* @@ -408,7 +377,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, #endif ret = setup_rt_frame(sig, ka, info, oldset, regs); - if (ret) { + if (ret == 0) { spin_lock_irq(¤t->sighand->siglock); sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); if (!(ka->sa.sa_flags & SA_NODEFER)) @@ -425,11 +394,12 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ -int do_signal(struct pt_regs *regs, sigset_t *oldset) +static void do_signal(struct pt_regs *regs) { struct k_sigaction ka; siginfo_t info; int signr; + sigset_t *oldset; /* * We want the common case to go fast, which @@ -438,9 +408,11 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) * if so. */ if (!user_mode(regs)) - return 1; + return; - if (!oldset) + if (test_thread_flag(TIF_RESTORE_SIGMASK)) + oldset = ¤t->saved_sigmask; + else oldset = ¤t->blocked; signr = get_signal_to_deliver(&info, &ka, regs, NULL); @@ -454,30 +426,46 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) set_debugreg(current->thread.debugreg7, 7); /* Whee! Actually deliver the signal. */ - return handle_signal(signr, &info, &ka, oldset, regs); + if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { + /* a signal was successfully delivered; the saved + * sigmask will have been stored in the signal frame, + * and will be restored by sigreturn, so we can simply + * clear the TIF_RESTORE_SIGMASK flag */ + clear_thread_flag(TIF_RESTORE_SIGMASK); + } + return; } /* Did we come from a system call? */ if ((long)regs->orig_rax >= 0) { /* Restart the system call - no handlers present */ long res = regs->rax; - if (res == -ERESTARTNOHAND || - res == -ERESTARTSYS || - res == -ERESTARTNOINTR) { + switch (res) { + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: regs->rax = regs->orig_rax; regs->rip -= 2; - } - if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) { + break; + case -ERESTART_RESTARTBLOCK: regs->rax = test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall; regs->rip -= 2; + break; } } - return 0; + + /* if there's no signal to deliver, we just put the saved sigmask + back. */ + if (test_thread_flag(TIF_RESTORE_SIGMASK)) { + clear_thread_flag(TIF_RESTORE_SIGMASK); + sigprocmask(SIG_SETMASK, ¤t->saved_sigmask, NULL); + } } -void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_flags) +void +do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { #ifdef DEBUG_SIG printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n", @@ -491,8 +479,8 @@ void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_ } /* deal with pending signal delivery */ - if (thread_info_flags & _TIF_SIGPENDING) - do_signal(regs,oldset); + if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) + do_signal(regs); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) -- cgit v1.2.3 From 2fbe7b25c8edaf2d10e6c1a4cc9f8afe714c4764 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] i386/x86-64: Remove un/set_nmi_callback and reserve/release_lapic_nmi functions Removes the un/set_nmi_callback and reserve/release_lapic_nmi functions as they are no longer needed. The various subsystems are modified to register with the die_notifier instead. Also includes compile fixes by Andrew Morton. Signed-off-by: Don Zickus Signed-off-by: Andi Kleen --- arch/x86_64/kernel/crash.c | 20 ++++++++- arch/x86_64/kernel/nmi.c | 102 ++++----------------------------------------- 2 files changed, 25 insertions(+), 97 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index d8d5750d610..44c8af65325 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -23,6 +23,7 @@ #include #include #include +#include /* This keeps a track of which one is crashing cpu. */ static int crashing_cpu; @@ -95,8 +96,18 @@ static void crash_save_self(struct pt_regs *regs) #ifdef CONFIG_SMP static atomic_t waiting_for_crash_ipi; -static int crash_nmi_callback(struct pt_regs *regs, int cpu) +static int crash_nmi_callback(struct notifier_block *self, + unsigned long val, void *data) { + struct pt_regs *regs; + int cpu; + + if (val != DIE_NMI) + return NOTIFY_OK; + + regs = ((struct die_args *)data)->regs; + cpu = raw_smp_processor_id(); + /* * Don't do anything if this handler is invoked on crashing cpu. * Otherwise, system will completely hang. Crashing cpu can get @@ -127,12 +138,17 @@ static void smp_send_nmi_allbutself(void) * cpu hotplug shouldn't matter. */ +static struct notifier_block crash_nmi_nb = { + .notifier_call = crash_nmi_callback, +}; + static void nmi_shootdown_cpus(void) { unsigned long msecs; atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); - set_nmi_callback(crash_nmi_callback); + if (register_die_notifier(&crash_nmi_nb)) + return; /* return what? */ /* * Ensure the new callback function is set before sending diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index f6b881b23a7..9d175dcf3a2 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -41,20 +41,6 @@ static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[2]); */ #define NMI_MAX_COUNTER_BITS 66 -/* - * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: - * - it may be reserved by some other driver, or not - * - when not reserved by some other driver, it may be used for - * the NMI watchdog, or not - * - * This is maintained separately from nmi_active because the NMI - * watchdog may also be driven from the I/O APIC timer. - */ -static DEFINE_SPINLOCK(lapic_nmi_owner_lock); -static unsigned int lapic_nmi_owner; -#define LAPIC_NMI_WATCHDOG (1<<0) -#define LAPIC_NMI_RESERVED (1<<1) - /* nmi_active: * >0: the lapic NMI watchdog is active, but can be disabled * <0: the lapic NMI watchdog has not been set up, and cannot @@ -321,33 +307,6 @@ static void enable_lapic_nmi_watchdog(void) touch_nmi_watchdog(); } -int reserve_lapic_nmi(void) -{ - unsigned int old_owner; - - spin_lock(&lapic_nmi_owner_lock); - old_owner = lapic_nmi_owner; - lapic_nmi_owner |= LAPIC_NMI_RESERVED; - spin_unlock(&lapic_nmi_owner_lock); - if (old_owner & LAPIC_NMI_RESERVED) - return -EBUSY; - if (old_owner & LAPIC_NMI_WATCHDOG) - disable_lapic_nmi_watchdog(); - return 0; -} - -void release_lapic_nmi(void) -{ - unsigned int new_owner; - - spin_lock(&lapic_nmi_owner_lock); - new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED; - lapic_nmi_owner = new_owner; - spin_unlock(&lapic_nmi_owner_lock); - if (new_owner & LAPIC_NMI_WATCHDOG) - enable_lapic_nmi_watchdog(); -} - void disable_timer_nmi_watchdog(void) { BUG_ON(nmi_watchdog != NMI_IO_APIC); @@ -762,13 +721,6 @@ done: return rc; } -static __kprobes int dummy_nmi_callback(struct pt_regs * regs, int cpu) -{ - return 0; -} - -static nmi_callback_t nmi_callback = dummy_nmi_callback; - asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) { nmi_enter(); @@ -779,21 +731,12 @@ asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) int do_nmi_callback(struct pt_regs * regs, int cpu) { - return rcu_dereference(nmi_callback)(regs, cpu); -} - -void set_nmi_callback(nmi_callback_t callback) -{ - vmalloc_sync_all(); - rcu_assign_pointer(nmi_callback, callback); -} -EXPORT_SYMBOL_GPL(set_nmi_callback); - -void unset_nmi_callback(void) -{ - nmi_callback = dummy_nmi_callback; +#ifdef CONFIG_SYSCTL + if (unknown_nmi_panic) + return unknown_nmi_panic_callback(regs, cpu); +#endif + return 0; } -EXPORT_SYMBOL_GPL(unset_nmi_callback); #ifdef CONFIG_SYSCTL @@ -802,37 +745,8 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) unsigned char reason = get_nmi_reason(); char buf[64]; - if (!(reason & 0xc0)) { - sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(buf,regs); - } - return 0; -} - -/* - * proc handler for /proc/sys/kernel/unknown_nmi_panic - */ -int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int old_state; - - old_state = unknown_nmi_panic; - proc_dointvec(table, write, file, buffer, length, ppos); - if (!!old_state == !!unknown_nmi_panic) - return 0; - - if (unknown_nmi_panic) { - if (reserve_lapic_nmi() < 0) { - unknown_nmi_panic = 0; - return -EBUSY; - } else { - set_nmi_callback(unknown_nmi_panic_callback); - } - } else { - release_lapic_nmi(); - unset_nmi_callback(); - } + sprintf(buf, "NMI received for unknown reason %02x\n", reason); + die_nmi(buf,regs); return 0; } @@ -846,8 +760,6 @@ EXPORT_SYMBOL(reserve_perfctr_nmi); EXPORT_SYMBOL(release_perfctr_nmi); EXPORT_SYMBOL(reserve_evntsel_nmi); EXPORT_SYMBOL(release_evntsel_nmi); -EXPORT_SYMBOL(reserve_lapic_nmi); -EXPORT_SYMBOL(release_lapic_nmi); EXPORT_SYMBOL(disable_timer_nmi_watchdog); EXPORT_SYMBOL(enable_timer_nmi_watchdog); EXPORT_SYMBOL(touch_nmi_watchdog); -- cgit v1.2.3 From 407984f1af259b31957c7c05075a454a751bb801 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] x86: Add abilty to enable/disable nmi watchdog with sysctl Adds a new /proc/sys/kernel/nmi call that will enable/disable the nmi watchdog. Signed-off-by: Don Zickus Signed-off-by: Andi Kleen --- arch/x86_64/kernel/nmi.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 9d175dcf3a2..3a17411a9a1 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -750,6 +750,54 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) return 0; } +/* + * proc handler for /proc/sys/kernel/nmi + */ +int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int old_state; + + nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; + old_state = nmi_watchdog_enabled; + proc_dointvec(table, write, file, buffer, length, ppos); + if (!!old_state == !!nmi_watchdog_enabled) + return 0; + + if (atomic_read(&nmi_active) < 0) { + printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); + return -EINVAL; + } + + /* if nmi_watchdog is not set yet, then set it */ + nmi_watchdog_default(); + + if (nmi_watchdog == NMI_LOCAL_APIC) + { + if (nmi_watchdog_enabled) + enable_lapic_nmi_watchdog(); + else + disable_lapic_nmi_watchdog(); + } else if (nmi_watchdog == NMI_IO_APIC) { + /* FIXME + * for some reason these functions don't work + */ + printk("Can not enable/disable NMI on IO APIC\n"); + return -EIO; +#if 0 + if (nmi_watchdog_enabled) + enable_timer_nmi_watchdog(); + else + disable_timer_nmi_watchdog(); +#endif + } else { + printk(KERN_WARNING + "NMI watchdog doesn't know what hardware to touch\n"); + return -EIO; + } + return 0; +} + #endif EXPORT_SYMBOL(nmi_active); -- cgit v1.2.3 From e33e89ab1a8d295de0500b697f4f31c3ceee9aa2 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] x86: Add abilty to enable/disable nmi watchdog from procfs (update) Adds a new /proc/sys/kernel/nmi_watchdog call that will enable/disable the nmi watchdog. By entering a non-zero value here, a user can enable the nmi watchdog to monitor the online cpus in the system. By entering a zero value here, a user can disable the nmi watchdog and free up a performance counter which could then be utilized by the oprofile subsystem, otherwise oprofile may be short a counter when in use. Signed-off-by: Don Zickus Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/kernel/nmi.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 3a17411a9a1..dd57410dad5 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -167,7 +167,7 @@ static __cpuinit inline int nmi_known_cpu(void) } /* Run after command line and cpu_init init, but before all other checks */ -void __cpuinit nmi_watchdog_default(void) +void nmi_watchdog_default(void) { if (nmi_watchdog != NMI_DEFAULT) return; @@ -766,32 +766,19 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, if (atomic_read(&nmi_active) < 0) { printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); - return -EINVAL; + return -EIO; } /* if nmi_watchdog is not set yet, then set it */ nmi_watchdog_default(); - if (nmi_watchdog == NMI_LOCAL_APIC) - { + if (nmi_watchdog == NMI_LOCAL_APIC) { if (nmi_watchdog_enabled) enable_lapic_nmi_watchdog(); else disable_lapic_nmi_watchdog(); - } else if (nmi_watchdog == NMI_IO_APIC) { - /* FIXME - * for some reason these functions don't work - */ - printk("Can not enable/disable NMI on IO APIC\n"); - return -EIO; -#if 0 - if (nmi_watchdog_enabled) - enable_timer_nmi_watchdog(); - else - disable_timer_nmi_watchdog(); -#endif } else { - printk(KERN_WARNING + printk( KERN_WARNING "NMI watchdog doesn't know what hardware to touch\n"); return -EIO; } -- cgit v1.2.3 From 8da5adda91df3d2fcc5300e68da491694c9af019 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] x86: Allow users to force a panic on NMI To quote Alan Cox: The default Linux behaviour on an NMI of either memory or unknown is to continue operation. For many environments such as scientific computing it is preferable that the box is taken out and the error dealt with than an uncorrected parity/ECC error get propogated. A small number of systems do generate NMI's for bizarre random reasons such as power management so the default is unchanged. In other respects the new proc/sys entry works like the existing panic controls already in that directory. This is separate to the edac support - EDAC allows supported chipsets to handle ECC errors well, this change allows unsupported cases to at least panic rather than cause problems further down the line. Signed-off-by: Don Zickus Signed-off-by: Andi Kleen --- arch/x86_64/kernel/traps.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 42bc070fdf1..b18829db2a6 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -732,6 +732,8 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) { printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); printk("You probably have a hardware problem with your RAM chips\n"); + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); /* Clear and disable the memory parity error line. */ reason = (reason & 0xf) | 4; @@ -757,6 +759,10 @@ unknown_nmi_error(unsigned char reason, struct pt_regs * regs) { printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); printk("Dazed and confused, but trying to continue\n"); printk("Do you have a strange power saving mode enabled?\n"); + + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + } /* Runs on IST stack. This code must keep interrupts off all the time. -- cgit v1.2.3 From c41c5cd3b20a2d81c30498f13b1527847a8fdf69 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] x86: x86 clean up nmi panic messages Clean up some of the output messages on the nmi error paths to make more sense when they are displayed. This is mainly a cosmetic fix and shouldn't impact any normal code path. Signed-off-by: Don Zickus Signed-off-by: Andi Kleen --- arch/x86_64/kernel/traps.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index b18829db2a6..dae10df6092 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -730,10 +730,15 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, static __kprobes void mem_parity_error(unsigned char reason, struct pt_regs * regs) { - printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); - printk("You probably have a hardware problem with your RAM chips\n"); + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", + reason); + printk(KERN_EMERG "You probably have a hardware problem with your " + "RAM chips\n"); + if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); + panic("NMI: Not continuing"); + + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); /* Clear and disable the memory parity error line. */ reason = (reason & 0xf) | 4; @@ -756,13 +761,15 @@ io_check_error(unsigned char reason, struct pt_regs * regs) static __kprobes void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) -{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); - printk("Dazed and confused, but trying to continue\n"); - printk("Do you have a strange power saving mode enabled?\n"); +{ + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", + reason); + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); + panic("NMI: Not continuing"); + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } /* Runs on IST stack. This code must keep interrupts off all the time. -- cgit v1.2.3 From 4038f901cf102a40715b900984ed7540a9fa637f Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] i386/x86-64: Fix NMI watchdog suspend/resume Making NMI suspend/resume work with SMP. We use CPU hotplug to offline APs in SMP suspend/resume. Only BSP executes sysdev's .suspend/.resume method. APs should follow CPU hotplug code path. And: +From: Don Zickus Makes the start/stop paths of nmi watchdog more robust to handle the suspend/resume cases more gracefully. AK: I merged the two patches together Signed-off-by: Shaohua Li Signed-off-by: Andi Kleen Cc: Don Zickus Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/kernel/nmi.c | 33 ++++++++++++++++++++++++++------- arch/x86_64/kernel/smpboot.c | 2 ++ 2 files changed, 28 insertions(+), 7 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index dd57410dad5..5a35975e576 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -63,7 +63,6 @@ struct nmi_watchdog_ctlblk { static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); /* local prototypes */ -static void stop_apic_nmi_watchdog(void *unused); static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); /* converts an msr to an appropriate reservation bit */ @@ -337,15 +336,20 @@ static int nmi_pm_active; /* nmi_active before suspend */ static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) { + /* only CPU0 goes here, other CPUs should be offline */ nmi_pm_active = atomic_read(&nmi_active); - disable_lapic_nmi_watchdog(); + stop_apic_nmi_watchdog(NULL); + BUG_ON(atomic_read(&nmi_active) != 0); return 0; } static int lapic_nmi_resume(struct sys_device *dev) { - if (nmi_pm_active > 0) - enable_lapic_nmi_watchdog(); + /* only CPU0 goes here, other CPUs should be offline */ + if (nmi_pm_active > 0) { + setup_apic_nmi_watchdog(NULL); + touch_nmi_watchdog(); + } return 0; } @@ -561,11 +565,21 @@ static void stop_p4_watchdog(void) void setup_apic_nmi_watchdog(void *unused) { + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + /* only support LOCAL and IO APICs for now */ if ((nmi_watchdog != NMI_LOCAL_APIC) && (nmi_watchdog != NMI_IO_APIC)) return; + if (wd->enabled == 1) + return; + + /* cheap hack to support suspend/resume */ + /* if cpu0 is not active neither should the other cpus */ + if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) + return; + if (nmi_watchdog == NMI_LOCAL_APIC) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: @@ -582,17 +596,22 @@ void setup_apic_nmi_watchdog(void *unused) return; } } - __get_cpu_var(nmi_watchdog_ctlblk.enabled) = 1; + wd->enabled = 1; atomic_inc(&nmi_active); } -static void stop_apic_nmi_watchdog(void *unused) +void stop_apic_nmi_watchdog(void *unused) { + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + /* only support LOCAL and IO APICs for now */ if ((nmi_watchdog != NMI_LOCAL_APIC) && (nmi_watchdog != NMI_IO_APIC)) return; + if (wd->enabled == 0) + return; + if (nmi_watchdog == NMI_LOCAL_APIC) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: @@ -607,7 +626,7 @@ static void stop_apic_nmi_watchdog(void *unused) return; } } - __get_cpu_var(nmi_watchdog_ctlblk.enabled) = 0; + wd->enabled = 0; atomic_dec(&nmi_active); } diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 975380207b4..15555879ce9 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -1233,6 +1233,8 @@ int __cpu_disable(void) if (cpu == 0) return -EBUSY; + if (nmi_watchdog == NMI_LOCAL_APIC) + stop_apic_nmi_watchdog(NULL); clear_local_APIC(); /* -- cgit v1.2.3 From fac58550e80c307bf17cfa0dd544fca4eff120a5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] Fix up panic messages for different NMI panics When a unknown NMI happened the panic would claim a NMI watchdog timeout. Also it would check the variable set by nmi_watchdog=panic and panic then. Fix up the panic message to be generic Unconditionally panic on unknown NMI when panic on unknown nmi is enabled. Noticed by Jan Beulich Cc: jbeulich@novell.com Signed-off-by: Andi Kleen --- arch/x86_64/kernel/nmi.c | 5 +++-- arch/x86_64/kernel/traps.c | 7 +++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 5a35975e576..1b76d157452 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -695,7 +695,8 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) */ local_inc(&__get_cpu_var(alert_counter)); if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) - die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs); + die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs, + panic_on_timeout); } else { __get_cpu_var(last_irq_sum) = sum; local_set(&__get_cpu_var(alert_counter), 0); @@ -765,7 +766,7 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) char buf[64]; sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(buf,regs); + die_nmi(buf, regs, 1); /* Always panic here */ return 0; } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index dae10df6092..96f62a03324 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -570,7 +570,7 @@ void die(const char * str, struct pt_regs * regs, long err) do_exit(SIGSEGV); } -void __kprobes die_nmi(char *str, struct pt_regs *regs) +void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) { unsigned long flags = oops_begin(); @@ -582,9 +582,8 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs) show_registers(regs); if (kexec_should_crash(current)) crash_kexec(regs); - if (panic_on_timeout || panic_on_oops) - panic("nmi watchdog"); - printk("console shuts up ...\n"); + if (do_panic || panic_on_oops) + panic("Non maskable interrupt"); oops_end(flags); nmi_exit(); local_irq_enable(); -- cgit v1.2.3 From 3f22c5789eb76fd9aabe4be37ba609c793f046f9 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] kdump x86_64 nmi event notification fix After a crash we should wait for NMI IPI event and not for external NMI or NMI watchdog tick. Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen Cc: Don Zickus Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/kernel/crash.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index 44c8af65325..fc57a139123 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -102,7 +102,7 @@ static int crash_nmi_callback(struct notifier_block *self, struct pt_regs *regs; int cpu; - if (val != DIE_NMI) + if (val != DIE_NMI_IPI) return NOTIFY_OK; regs = ((struct die_args *)data)->regs; @@ -114,7 +114,7 @@ static int crash_nmi_callback(struct notifier_block *self, * an NMI if system was initially booted with nmi_watchdog parameter. */ if (cpu == crashing_cpu) - return 1; + return NOTIFY_STOP; local_irq_disable(); crash_save_this_cpu(regs, cpu); -- cgit v1.2.3 From 248dcb2ffffe8f3e4a369556a68988788c208111 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] x86: i386/x86-64 Add nmi watchdog support for new Intel CPUs AK: This redoes the changes I temporarily reverted. Intel now has support for Architectural Performance Monitoring Counters ( Refer to IA-32 Intel Architecture Software Developer's Manual http://www.intel.com/design/pentium4/manuals/253669.htm ). This feature is present starting from Intel Core Duo and Intel Core Solo processors. What this means is, the performance monitoring counters and some performance monitoring events are now defined in an architectural way (using cpuid). And there will be no need to check for family/model etc for these architectural events. Below is the patch to use this performance counters in nmi watchdog driver. Patch handles both i386 and x86-64 kernels. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Andi Kleen --- arch/x86_64/kernel/nmi.c | 130 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 125 insertions(+), 5 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 1b76d157452..4d6fb047952 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -26,6 +26,7 @@ #include #include #include +#include /* perfctr_nmi_owner tracks the ownership of the perfctr registers: * evtsel_nmi_owner tracks the ownership of the event selection @@ -73,7 +74,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) case X86_VENDOR_AMD: return (msr - MSR_K7_PERFCTR0); case X86_VENDOR_INTEL: - return (msr - MSR_P4_BPU_PERFCTR0); + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return (msr - MSR_ARCH_PERFMON_PERFCTR0); + else + return (msr - MSR_P4_BPU_PERFCTR0); } return 0; } @@ -86,7 +90,10 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) case X86_VENDOR_AMD: return (msr - MSR_K7_EVNTSEL0); case X86_VENDOR_INTEL: - return (msr - MSR_P4_BSU_ESCR0); + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return (msr - MSR_ARCH_PERFMON_EVENTSEL0); + else + return (msr - MSR_P4_BSU_ESCR0); } return 0; } @@ -160,7 +167,10 @@ static __cpuinit inline int nmi_known_cpu(void) case X86_VENDOR_AMD: return boot_cpu_data.x86 == 15; case X86_VENDOR_INTEL: - return boot_cpu_data.x86 == 15; + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return 1; + else + return (boot_cpu_data.x86 == 15); } return 0; } @@ -246,8 +256,22 @@ int __init check_nmi_watchdog (void) /* now that we know it works we can reduce NMI frequency to something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) + if (nmi_watchdog == NMI_LOCAL_APIC) { + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + nmi_hz = 1; + /* + * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter + * are writable, with higher bits sign extending from bit 31. + * So, we can only program the counter with 31 bit values and + * 32nd bit should be 1, for 33.. to be 1. + * Find the appropriate nmi_hz + */ + if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 && + ((u64)cpu_khz * 1000) > 0x7fffffffULL) { + nmi_hz = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1; + } + } kfree(counts); return 0; @@ -563,6 +587,87 @@ static void stop_p4_watchdog(void) release_perfctr_nmi(wd->perfctr_msr); } +#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL +#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK + +static int setup_intel_arch_watchdog(void) +{ + unsigned int ebx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int perfctr_msr, evntsel_msr; + unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebx indicates event present. + */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + goto fail; + + perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; + evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0; + + if (!reserve_perfctr_nmi(perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(evntsel_msr)) + goto fail1; + + wrmsrl(perfctr_msr, 0UL); + + evntsel = ARCH_PERFMON_EVENTSEL_INT + | ARCH_PERFMON_EVENTSEL_OS + | ARCH_PERFMON_EVENTSEL_USR + | ARCH_PERFMON_NMI_EVENT_SEL + | ARCH_PERFMON_NMI_EVENT_UMASK; + + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); + + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + wd->check_bit = 1ULL << (eax.split.bit_width - 1); + return 1; +fail1: + release_perfctr_nmi(perfctr_msr); +fail: + return 0; +} + +static void stop_intel_arch_watchdog(void) +{ + unsigned int ebx; + union cpuid10_eax eax; + unsigned int unused; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebx indicates event present. + */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + return; + + wrmsr(wd->evntsel_msr, 0, 0); + + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + void setup_apic_nmi_watchdog(void *unused) { struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); @@ -589,6 +694,11 @@ void setup_apic_nmi_watchdog(void *unused) return; break; case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + if (!setup_intel_arch_watchdog()) + return; + break; + } if (!setup_p4_watchdog()) return; break; @@ -620,6 +730,10 @@ void stop_apic_nmi_watchdog(void *unused) stop_k7_watchdog(); break; case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + stop_intel_arch_watchdog(); + break; + } stop_p4_watchdog(); break; default: @@ -724,7 +838,13 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) dummy &= ~P4_CCCR_OVF; wrmsrl(wd->cccr_msr, dummy); apic_write(APIC_LVTPC, APIC_DM_NMI); - } + } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { + /* + * ArchPerfom/Core Duo needs to re-unmask + * the apic vector + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + } /* start the cycle over again */ wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); rc = 1; -- cgit v1.2.3 From a670fad0adb1cc6202a607d250f10bd380593905 Mon Sep 17 00:00:00 2001 From: Vojtech Pavlik Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] Add initalization of the RDTSCP auxilliary values This patch adds initalization of the RDTSCP auxilliary values to CPU numbers to time.c. If RDTSCP is available, the MSRs are written with the respective values. It can be later used to initalize per-cpu timekeeping variables. AK: Some cleanups. Move externs into headers and fix CPU hotplug. Signed-off-by: Vojtech Pavlik Signed-off-by: Andi Kleen --- arch/x86_64/kernel/smpboot.c | 2 ++ arch/x86_64/kernel/time.c | 47 +++++++++++++++++++++++++++++++++----------- 2 files changed, 37 insertions(+), 12 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 15555879ce9..582896f7d42 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -1181,6 +1181,8 @@ void __init smp_cpus_done(unsigned int max_cpus) #endif check_nmi_watchdog(); + + time_init_gtod(); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 7a9b1822418..97b9e46d199 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include #include #ifdef CONFIG_ACPI @@ -49,7 +51,7 @@ static void cpufreq_delayed_get(void); extern void i8254_timer_resume(void); extern int using_apic_timer; -static char *time_init_gtod(void); +static char *timename = NULL; DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); @@ -893,11 +895,21 @@ static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL }; -void __init time_init(void) +static int __cpuinit +time_cpu_notifier(struct notifier_block *nb, unsigned long action, void *hcpu) { - char *timename; - char *gtod; + unsigned cpu = (unsigned long) hcpu; + if (action == CPU_ONLINE && + cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) { + unsigned p; + p = smp_processor_id() | (cpu_to_node(smp_processor_id())<<12); + write_rdtscp_aux(p); + } + return NOTIFY_DONE; +} +void __init time_init(void) +{ if (nohpet) vxtime.hpet_address = 0; @@ -931,18 +943,19 @@ void __init time_init(void) } vxtime.mode = VXTIME_TSC; - gtod = time_init_gtod(); - - printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n", - vxtime_hz / 1000000, vxtime_hz % 1000000, timename, gtod); - printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; vxtime.last_tsc = get_cycles_sync(); setup_irq(0, &irq0); set_cyc2ns_scale(cpu_khz); + + hotcpu_notifier(time_cpu_notifier, 0); + time_cpu_notifier(NULL, CPU_ONLINE, (void *)(long)smp_processor_id()); + +#ifndef CONFIG_SMP + time_init_gtod(); +#endif } /* @@ -973,12 +986,13 @@ __cpuinit int unsynchronized_tsc(void) /* * Decide what mode gettimeofday should use. */ -__init static char *time_init_gtod(void) +void time_init_gtod(void) { char *timetype; if (unsynchronized_tsc()) notsc = 1; + if (vxtime.hpet_address && notsc) { timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; if (hpet_use_timer) @@ -1001,7 +1015,16 @@ __init static char *time_init_gtod(void) timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC"; vxtime.mode = VXTIME_TSC; } - return timetype; + + printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n", + vxtime_hz / 1000000, vxtime_hz % 1000000, timename, timetype); + printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); + vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; + vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; + vxtime.last_tsc = get_cycles_sync(); + + set_cyc2ns_scale(cpu_khz); } __setup("report_lost_ticks", time_setup); -- cgit v1.2.3 From c08c820508233b424deab3302bc404bbecc6493a Mon Sep 17 00:00:00 2001 From: Vojtech Pavlik Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] Add the vgetcpu vsyscall This patch adds a vgetcpu vsyscall, which depending on the CPU RDTSCP capability uses either the RDTSCP or CPUID to obtain a CPU and node numbers and pass them to the program. AK: Lots of changes over Vojtech's original code: Better prototype for vgetcpu() It's better to pass the cpu / node numbers as separate arguments to avoid mistakes when going from SMP to NUMA. Also add a fast time stamp based cache using a user supplied argument to speed things more up. Use fast method from Chuck Ebbert to retrieve node/cpu from GDT limit instead of CPUID Made sure RDTSCP init is always executed after node is known. Drop printk Signed-off-by: Vojtech Pavlik Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head.S | 2 +- arch/x86_64/kernel/time.c | 13 ++++--- arch/x86_64/kernel/vmlinux.lds.S | 3 ++ arch/x86_64/kernel/vsyscall.c | 84 +++++++++++++++++++++++++++++++++++++++- 4 files changed, 93 insertions(+), 9 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index c9739ca81d0..505ec4a5750 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -371,7 +371,7 @@ ENTRY(cpu_gdt_table) .quad 0,0 /* TSS */ .quad 0,0 /* LDT */ .quad 0,0,0 /* three TLS descriptors */ - .quad 0 /* unused */ + .quad 0x0000f40000000000 /* node/CPU stored in limit */ gdt_end: /* asm/segment.h:GDT_ENTRIES must match this */ /* This should be a multiple of the cache line size */ diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 97b9e46d199..560ed944dc0 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -899,12 +899,8 @@ static int __cpuinit time_cpu_notifier(struct notifier_block *nb, unsigned long action, void *hcpu) { unsigned cpu = (unsigned long) hcpu; - if (action == CPU_ONLINE && - cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) { - unsigned p; - p = smp_processor_id() | (cpu_to_node(smp_processor_id())<<12); - write_rdtscp_aux(p); - } + if (action == CPU_ONLINE) + vsyscall_set_cpu(cpu); return NOTIFY_DONE; } @@ -993,6 +989,11 @@ void time_init_gtod(void) if (unsynchronized_tsc()) notsc = 1; + if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) + vgetcpu_mode = VGETCPU_RDTSCP; + else + vgetcpu_mode = VGETCPU_LSL; + if (vxtime.hpet_address && notsc) { timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; if (hpet_use_timer) diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 7c4de31471d..8d5a5149bb3 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -99,6 +99,9 @@ SECTIONS .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) } vxtime = VVIRT(.vxtime); + .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } + vgetcpu_mode = VVIRT(.vgetcpu_mode); + .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) } wall_jiffies = VVIRT(.wall_jiffies); diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index f603037df16..902783bc4d5 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -33,11 +34,15 @@ #include #include #include +#include +#include +#include #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) int __sysctl_vsyscall __section_sysctl_vsyscall = 1; seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; +int __vgetcpu_mode __section_vgetcpu_mode; #include @@ -127,9 +132,46 @@ time_t __vsyscall(1) vtime(time_t *t) return __xtime.tv_sec; } -long __vsyscall(2) venosys_0(void) +/* Fast way to get current CPU and node. + This helps to do per node and per CPU caches in user space. + The result is not guaranteed without CPU affinity, but usually + works out because the scheduler tries to keep a thread on the same + CPU. + + tcache must point to a two element sized long array. + All arguments can be NULL. */ +long __vsyscall(2) +vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) { - return -ENOSYS; + unsigned int dummy, p; + unsigned long j = 0; + + /* Fast cache - only recompute value once per jiffies and avoid + relatively costly rdtscp/cpuid otherwise. + This works because the scheduler usually keeps the process + on the same CPU and this syscall doesn't guarantee its + results anyways. + We do this here because otherwise user space would do it on + its own in a likely inferior way (no access to jiffies). + If you don't like it pass NULL. */ + if (tcache && tcache->t0 == (j = __jiffies)) { + p = tcache->t1; + } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { + /* Load per CPU data from RDTSCP */ + rdtscp(dummy, dummy, p); + } else { + /* Load per CPU data from GDT */ + asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + } + if (tcache) { + tcache->t0 = j; + tcache->t1 = p; + } + if (cpu) + *cpu = p & 0xfff; + if (node) + *node = p >> 12; + return 0; } long __vsyscall(3) venosys_1(void) @@ -200,6 +242,43 @@ static ctl_table kernel_root_table2[] = { #endif +static void __cpuinit write_rdtscp_cb(void *info) +{ + write_rdtscp_aux((unsigned long)info); +} + +void __cpuinit vsyscall_set_cpu(int cpu) +{ + unsigned long *d; + unsigned long node = 0; +#ifdef CONFIG_NUMA + node = cpu_to_node[cpu]; +#endif + if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) { + void *info = (void *)((node << 12) | cpu); + /* Can happen on preemptive kernel */ + if (get_cpu() == cpu) + write_rdtscp_cb(info); +#ifdef CONFIG_SMP + else { + /* the notifier is unfortunately not executed on the + target CPU */ + smp_call_function_single(cpu,write_rdtscp_cb,info,0,1); + } +#endif + put_cpu(); + } + + /* Store cpu number in limit so that it can be loaded quickly + in user space in vgetcpu. + 12 bits for the CPU and 8 bits for the node. */ + d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU); + *d = 0x0f40000000000ULL; + *d |= cpu; + *d |= (node & 0xf) << 12; + *d |= (node >> 4) << 48; +} + static void __init map_vsyscall(void) { extern char __vsyscall_0; @@ -214,6 +293,7 @@ static int __init vsyscall_init(void) VSYSCALL_ADDR(__NR_vgettimeofday))); BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); + BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); map_vsyscall(); #ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2, 0); -- cgit v1.2.3 From 2f766d16062d0147edff91be15de4a950667ca42 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] Clean up asm/smp.h includes No need to include it from entry.S Drop all the #ifdef __ASSEMBLY__ Signed-off-by: Andi Kleen --- arch/x86_64/kernel/entry.S | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index aa8d8939abc..7eb1678e098 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -27,10 +27,8 @@ * - schedule it carefully for the final hardware. */ -#define ASSEMBLY 1 #include #include -#include #include #include #include -- cgit v1.2.3 From d3a4f48d4866b8623ca9adde8ce4e5fde979c132 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] x86-64 TIF flags for debug regs and io bitmap in ctxsw Hello, Following my discussion with Andi. Here is a patch that introduces two new TIF flags to simplify the context switch code in __switch_to(). The idea is to minimize the number of cache lines accessed in the common case, i.e., when neither the debug registers nor the I/O bitmap are used. This patch covers the x86-64 modifications. A patch for i386 follows. Changelog: - add TIF_DEBUG to track when debug registers are active - add TIF_IO_BITMAP to track when I/O bitmap is used - modify __switch_to() to use the new TIF flags : eranian@hpl.hp.com Signed-off-by: Andi Kleen --- arch/x86_64/kernel/ioport.c | 1 + arch/x86_64/kernel/process.c | 73 +++++++++++++++++++++++++------------------- arch/x86_64/kernel/ptrace.c | 8 +++-- 3 files changed, 49 insertions(+), 33 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c index b81614970ec..fe063d3cfe4 100644 --- a/arch/x86_64/kernel/ioport.c +++ b/arch/x86_64/kernel/ioport.c @@ -56,6 +56,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) memset(bitmap, 0xff, IO_BITMAP_BYTES); t->io_bitmap_ptr = bitmap; + set_thread_flag(TIF_IO_BITMAP); } /* diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index bb6745d13b8..6e0527635b4 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -350,6 +350,7 @@ void exit_thread(void) kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; + clear_thread_flag(TIF_IO_BITMAP); /* * Careful, clear this in the TSS too: */ @@ -369,6 +370,7 @@ void flush_thread(void) if (t->flags & _TIF_IA32) current_thread_info()->status |= TS_COMPAT; } + t->flags &= ~_TIF_DEBUG; tsk->thread.debugreg0 = 0; tsk->thread.debugreg1 = 0; @@ -461,7 +463,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, asm("mov %%es,%0" : "=m" (p->thread.es)); asm("mov %%ds,%0" : "=m" (p->thread.ds)); - if (unlikely(me->thread.io_bitmap_ptr != NULL)) { + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { p->thread.io_bitmap_max = 0; @@ -469,6 +471,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, } memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES); + set_tsk_thread_flag(p, TIF_IO_BITMAP); } /* @@ -498,6 +501,40 @@ out: */ #define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) +static inline void __switch_to_xtra(struct task_struct *prev_p, + struct task_struct *next_p, + struct tss_struct *tss) +{ + struct thread_struct *prev, *next; + + prev = &prev_p->thread, + next = &next_p->thread; + + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { + loaddebug(next, 0); + loaddebug(next, 1); + loaddebug(next, 2); + loaddebug(next, 3); + /* no 4 and 5 */ + loaddebug(next, 6); + loaddebug(next, 7); + } + + if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { + /* + * Copy the relevant range of the IO bitmap. + * Normally this is 128 bytes or less: + */ + memcpy(tss->io_bitmap, next->io_bitmap_ptr, + max(prev->io_bitmap_max, next->io_bitmap_max)); + } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { + /* + * Clear any possible leftover bits: + */ + memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); + } +} + /* * switch_to(x,y) should switch tasks from x to y. * @@ -586,37 +623,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); /* - * Now maybe reload the debug registers + * Now maybe reload the debug registers and handle I/O bitmaps */ - if (unlikely(next->debugreg7)) { - loaddebug(next, 0); - loaddebug(next, 1); - loaddebug(next, 2); - loaddebug(next, 3); - /* no 4 and 5 */ - loaddebug(next, 6); - loaddebug(next, 7); - } - - - /* - * Handle the IO bitmap - */ - if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { - if (next->io_bitmap_ptr) - /* - * Copy the relevant range of the IO bitmap. - * Normally this is 128 bytes or less: - */ - memcpy(tss->io_bitmap, next->io_bitmap_ptr, - max(prev->io_bitmap_max, next->io_bitmap_max)); - else { - /* - * Clear any possible leftover bits: - */ - memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); - } - } + if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) + || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) + __switch_to_xtra(prev_p, next_p, tss); return prev_p; } diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index 2d50024c9f3..d35ec1bc696 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -420,9 +420,13 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1) break; if (i == 4) { - child->thread.debugreg7 = data; + child->thread.debugreg7 = data; + if (data) + set_tsk_thread_flag(child, TIF_DEBUG); + else + clear_tsk_thread_flag(child, TIF_DEBUG); ret = 0; - } + } break; } break; -- cgit v1.2.3 From d5d9ca6d882f7c8d47ef91a701fc042cbebbc334 Mon Sep 17 00:00:00 2001 From: Adam Henley Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] A few trivial spelling and grammar fixes A few trivial spelling and grammar mistakes picked up in "arch/x86_64/aperture.c", "arch/x86_64/crash.c" and "arch/x86_64/apic.c". I think all are correct fixes but am ever aware of my fallibility :o) This is my first patch submission so all feedback is appreciated, esp. WRT CCing to Linus, Andi and trivial@kernel.org, is this correct? And which is the most appropriate kernel version to diff against? If any. Should apply cleanly to 2.6.18-rc1 Signed-off-by: Adam Henley Signed-off-by: Andi Kleen - adam --- arch/x86_64/kernel/aperture.c | 2 +- arch/x86_64/kernel/apic.c | 4 ++-- arch/x86_64/kernel/crash.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index 58af8e73738..04dbf166252 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -48,7 +48,7 @@ static u32 __init allocate_aperture(void) /* * Aperture has to be naturally aligned. This means an 2GB aperture won't - * have much chances to find a place in the lower 4GB of memory. + * have much chance of finding a place in the lower 4GB of memory. * Unfortunately we cannot move it up because that would make the * IOMMU useless. */ diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 692e9b57974..63d9b037afc 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -400,7 +400,7 @@ void __cpuinit setup_local_APIC (void) value |= APIC_SPIV_APIC_ENABLED; /* - * Some unknown Intel IO/APIC (or APIC) errata is biting us with + * Some unknown Intel IO/APIC (or APIC) errata are biting us with * certain networking cards. If high frequency interrupts are * happening on a particular IOAPIC pin, plus the IOAPIC routing * entry is masked/unmasked at a high rate as well then sooner or @@ -950,7 +950,7 @@ void smp_local_timer_interrupt(struct pt_regs *regs) * We take the 'long' return path, and there every subsystem * grabs the appropriate locks (kernel lock/ irq lock). * - * we might want to decouple profiling from the 'long path', + * We might want to decouple profiling from the 'long path', * and do the profiling totally in assembly. * * Currently this isn't too much of an issue (performance wise), diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index fc57a139123..7d7a9e75f70 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -69,7 +69,7 @@ static void crash_save_this_cpu(struct pt_regs *regs, int cpu) * for the data I pass, and I need tags * on the data to indicate what information I have * squirrelled away. ELF notes happen to provide - * all of that that no need to invent something new. + * all of that, no need to invent something new. */ buf = (u32*)per_cpu_ptr(crash_notes, cpu); -- cgit v1.2.3 From c16b63e09d9d03158e0a92e961234e94c4862620 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] i386/x86-64: Don't randomize stack top when no randomization personality is set Based on patch from Frank van Maarseveen , but extended. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 6e0527635b4..6fbd19564e4 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -845,7 +845,7 @@ int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) unsigned long arch_align_stack(unsigned long sp) { - if (randomize_va_space) + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) sp -= get_random_int() % 8192; return sp & ~0xf; } -- cgit v1.2.3 From 31679f38d88696ed032d59e457f1605c97e7d719 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] Simplify profile_pc on x86-64 Use knowledge about EFLAGS layout (bits 22:63 are always 0) to distingush EFLAGS word and kernel address in the spin lock stack frame. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/time.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 560ed944dc0..ea00915d393 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -189,20 +189,15 @@ unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); - /* Assume the lock function has either no stack frame or only a single - word. This checks if the address on the stack looks like a kernel - text address. - There is a small window for false hits, but in that case the tick - is just accounted to the spinlock function. - Better would be to write these functions in assembler again - and check exactly. */ + /* Assume the lock function has either no stack frame or a copy + of eflags from PUSHF + Eflags always has bits 22 and up cleared unlike kernel addresses. */ if (!user_mode(regs) && in_lock_functions(pc)) { - char *v = *(char **)regs->rsp; - if ((v >= _stext && v <= _etext) || - (v >= _sinittext && v <= _einittext) || - (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END)) - return (unsigned long)v; - return ((unsigned long *)regs->rsp)[1]; + unsigned long *sp = (unsigned long *)regs->rsp; + if (sp[0] >> 22) + return sp[0]; + if (sp[1] >> 22) + return sp[1]; } return pc; } -- cgit v1.2.3 From b06babac45e1546dfb504f1f25eb0495632bfc41 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Add proper alignment to ENTRY Previously it didn't align. Use the same one as the C compiler in blended mode, which is good for K8 and Core2 and doesn't hurt on P4. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/entry.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 7eb1678e098..2dc5c01f754 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -617,8 +617,7 @@ retint_signal: #ifdef CONFIG_PREEMPT /* Returning to kernel space. Check if we need preemption */ /* rcx: threadinfo. interrupts off. */ - .p2align -retint_kernel: +ENTRY(retint_kernel) cmpl $0,threadinfo_preempt_count(%rcx) jnz retint_restore_args bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) -- cgit v1.2.3 From 5ba5891d44a6acade44887a0f3195489d46c12de Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Add some comments what tce.c actually does Signed-off-by: Andi Kleen --- arch/x86_64/kernel/tce.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c index 5530dda3f27..0905ce6b9b3 100644 --- a/arch/x86_64/kernel/tce.c +++ b/arch/x86_64/kernel/tce.c @@ -1,4 +1,6 @@ /* + * This file manages the translation entries for the IBM Calgary IOMMU. + * * Derived from arch/powerpc/platforms/pseries/iommu.c * * Copyright (C) IBM Corporation, 2006 -- cgit v1.2.3 From 7f11d8a5efd625ffa41cde1d8472f85e885478ec Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Remove all ifdefs for local/io apic IO-APIC or local APIC can only be disabled at runtime anyways and Kconfig has forced these options on for a long time now. The Kconfigs are kept only now for the benefit of the shared acpi boot.c code. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/Makefile | 4 ++-- arch/x86_64/kernel/apic.c | 6 +----- arch/x86_64/kernel/crash.c | 2 -- arch/x86_64/kernel/entry.S | 2 -- arch/x86_64/kernel/head64.c | 2 -- arch/x86_64/kernel/i8259.c | 9 --------- arch/x86_64/kernel/irq.c | 6 ------ arch/x86_64/kernel/mpparse.c | 9 --------- arch/x86_64/kernel/setup.c | 6 ------ arch/x86_64/kernel/smpboot.c | 5 ----- arch/x86_64/kernel/time.c | 8 -------- arch/x86_64/kernel/traps.c | 2 -- 12 files changed, 3 insertions(+), 58 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index b5aaeafc1cd..0ef39553c0f 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -20,8 +20,8 @@ obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o -obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ +obj-y += apic.o nmi.o +obj-y += io_apic.o mpparse.o \ genapic.o genapic_cluster.o genapic_flat.o obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 63d9b037afc..c6ca100a7de 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -661,7 +661,6 @@ void __init init_apic_mappings(void) */ boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); -#ifdef CONFIG_X86_IO_APIC { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; int i; @@ -679,7 +678,6 @@ void __init init_apic_mappings(void) idx++; } } -#endif } /* @@ -1129,12 +1127,10 @@ int __init APIC_init_uniprocessor (void) setup_local_APIC(); -#ifdef CONFIG_X86_IO_APIC if (smp_found_config && !skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); + setup_IO_APIC(); else nr_ioapics = 0; -#endif setup_boot_APIC_clock(); check_nmi_watchdog(); return 0; diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index 7d7a9e75f70..3525f884af8 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -194,9 +194,7 @@ void machine_crash_shutdown(struct pt_regs *regs) if(cpu_has_apic) disable_local_APIC(); -#if defined(CONFIG_X86_IO_APIC) disable_IO_APIC(); -#endif crash_save_self(regs); } diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 2dc5c01f754..a0cf36ba7c5 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -676,7 +676,6 @@ ENTRY(call_function_interrupt) END(call_function_interrupt) #endif -#ifdef CONFIG_X86_LOCAL_APIC ENTRY(apic_timer_interrupt) apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt END(apic_timer_interrupt) @@ -688,7 +687,6 @@ END(error_interrupt) ENTRY(spurious_interrupt) apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt END(spurious_interrupt) -#endif /* * Exception entry points. diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index 36647ce6aec..bacbd75c63a 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -111,10 +111,8 @@ void __init x86_64_start_kernel(char * real_mode_data) if (s != NULL) numa_setup(s+5); #endif -#ifdef CONFIG_X86_IO_APIC if (strstr(saved_command_line, "disableapic")) disable_apic = 1; -#endif /* You need early console to see that */ if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE) panic("Kernel too big for kernel mapping\n"); diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index 0434b1f8e3d..d2f5acde574 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -55,7 +55,6 @@ */ BUILD_16_IRQS(0x0) -#ifdef CONFIG_X86_LOCAL_APIC /* * The IO-APIC gives us many more interrupt sources. Most of these * are unused but an SMP system is supposed to have enough memory ... @@ -75,8 +74,6 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_15_IRQS(0xe) #endif -#endif - #undef BUILD_16_IRQS #undef BUILD_15_IRQS #undef BI @@ -100,7 +97,6 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) void (*interrupt[NR_IRQS])(void) = { IRQLIST_16(0x0), -#ifdef CONFIG_X86_IO_APIC IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3), IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), @@ -110,7 +106,6 @@ void (*interrupt[NR_IRQS])(void) = { , IRQLIST_15(0xe) #endif -#endif }; #undef IRQ @@ -453,9 +448,7 @@ void __init init_ISA_irqs (void) { int i; -#ifdef CONFIG_X86_LOCAL_APIC init_bsp_APIC(); -#endif init_8259A(0); for (i = 0; i < NR_IRQS; i++) { @@ -581,14 +574,12 @@ void __init init_IRQ(void) set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); -#ifdef CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC timer */ set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); /* IPI vectors for APIC spurious and error interrupts */ set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); -#endif /* * Set the clock to HZ Hz, we already have a valid diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index 5221a53e90c..bfbcf92cfa4 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -20,11 +20,9 @@ #include atomic_t irq_err_count; -#ifdef CONFIG_X86_IO_APIC #ifdef APIC_MISMATCH_DEBUG atomic_t irq_mis_count; #endif -#endif #ifdef CONFIG_DEBUG_STACKOVERFLOW /* @@ -92,17 +90,13 @@ skip: for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); seq_putc(p, '\n'); -#ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "LOC: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); seq_putc(p, '\n'); -#endif seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); -#ifdef CONFIG_X86_IO_APIC #ifdef APIC_MISMATCH_DEBUG seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); -#endif #endif } return 0; diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index a1ab4197f8a..c73dd1f41e6 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -74,14 +74,10 @@ physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; /* ACPI MADT entry parsing functions */ #ifdef CONFIG_ACPI extern struct acpi_boot_flags acpi_boot; -#ifdef CONFIG_X86_LOCAL_APIC extern int acpi_parse_lapic (acpi_table_entry_header *header); extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header); extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header); -#endif /*CONFIG_X86_LOCAL_APIC*/ -#ifdef CONFIG_X86_IO_APIC extern int acpi_parse_ioapic (acpi_table_entry_header *header); -#endif /*CONFIG_X86_IO_APIC*/ #endif /*CONFIG_ACPI*/ u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; @@ -661,9 +657,7 @@ void __init find_intel_smp (void) */ void __init find_smp_config (void) { -#ifdef CONFIG_X86_LOCAL_APIC find_intel_smp(); -#endif } @@ -717,8 +711,6 @@ void __cpuinit mp_register_lapic ( MP_processor_info(&processor); } -#ifdef CONFIG_X86_IO_APIC - #define MP_ISA_BUS 0 #define MP_MAX_IOAPIC_PIN 127 @@ -997,5 +989,4 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) return gsi; } -#endif /*CONFIG_X86_IO_APIC*/ #endif /*CONFIG_ACPI*/ diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 34afad70482..fbe9f7faa2d 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -340,10 +340,8 @@ static __init void parse_cmdline_early (char ** cmdline_p) else if (fullarg(from, "acpi=strict")) { acpi_strict = 1; } -#ifdef CONFIG_X86_IO_APIC else if (fullarg(from, "acpi_skip_timer_override")) acpi_skip_timer_override = 1; -#endif #endif if (fullarg(from, "disable_timer_pin_1")) @@ -625,12 +623,10 @@ void __init setup_arch(char **cmdline_p) */ acpi_reserve_bootmem(); #endif -#ifdef CONFIG_X86_LOCAL_APIC /* * Find and reserve possible boot-time SMP configuration: */ find_smp_config(); -#endif #ifdef CONFIG_BLK_DEV_INITRD if (LOADER_TYPE && INITRD_START) { if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { @@ -674,14 +670,12 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); -#ifdef CONFIG_X86_LOCAL_APIC /* * get boot-time SMP configuration: */ if (smp_found_config) get_smp_config(); init_apic_mappings(); -#endif /* * Request address space for all standard RAM and ROM resources diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 582896f7d42..d29571e2495 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -1175,13 +1175,8 @@ int __cpuinit __cpu_up(unsigned int cpu) void __init smp_cpus_done(unsigned int max_cpus) { smp_cleanup_boot(); - -#ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); -#endif - check_nmi_watchdog(); - time_init_gtod(); } diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index ea00915d393..d66c7f750e7 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -41,9 +41,7 @@ #include #include #include -#ifdef CONFIG_X86_LOCAL_APIC #include -#endif #ifdef CONFIG_CPU_FREQ static void cpufreq_delayed_get(void); @@ -438,12 +436,8 @@ void main_timer_handler(struct pt_regs *regs) * have to call the local interrupt handler. */ -#ifndef CONFIG_X86_LOCAL_APIC - profile_tick(CPU_PROFILING, regs); -#else if (!using_apic_timer) smp_local_timer_interrupt(regs); -#endif /* * If we have an externally synchronized Linux clock, then update CMOS clock @@ -467,10 +461,8 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) if (apic_runs_main_timer > 1) return IRQ_HANDLED; main_timer_handler(regs); -#ifdef CONFIG_X86_LOCAL_APIC if (using_apic_timer) smp_send_timer_broadcast_ipi(); -#endif return IRQ_HANDLED; } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 96f62a03324..56d7ff0c894 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -788,7 +788,6 @@ asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs) if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; -#ifdef CONFIG_X86_LOCAL_APIC /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. @@ -796,7 +795,6 @@ asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs) if (nmi_watchdog_tick(regs,reason)) return; if (!do_nmi_callback(regs,cpu)) -#endif unknown_nmi_error(reason, regs); return; -- cgit v1.2.3 From 6c96a29f20762e8ce40b674f906055d009c302ee Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Remove apic mismatch counter Nobody has been setting the mismatch counter and the ifdef was never set so remove it. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/irq.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index bfbcf92cfa4..b3677e6ccc6 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -20,9 +20,6 @@ #include atomic_t irq_err_count; -#ifdef APIC_MISMATCH_DEBUG -atomic_t irq_mis_count; -#endif #ifdef CONFIG_DEBUG_STACKOVERFLOW /* @@ -95,9 +92,6 @@ skip: seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); seq_putc(p, '\n'); seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); -#ifdef APIC_MISMATCH_DEBUG - seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); -#endif } return 0; } -- cgit v1.2.3 From 3f14c746a61ec932c204aca820c02c293118c5df Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Remove old "focus disabled" chipset errata workaround The new systems already use focus disabled and the comment was completely outdated. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/apic.c | 28 ++-------------------------- 1 file changed, 2 insertions(+), 26 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index c6ca100a7de..78c43b1a306 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -399,32 +399,8 @@ void __cpuinit setup_local_APIC (void) */ value |= APIC_SPIV_APIC_ENABLED; - /* - * Some unknown Intel IO/APIC (or APIC) errata are biting us with - * certain networking cards. If high frequency interrupts are - * happening on a particular IOAPIC pin, plus the IOAPIC routing - * entry is masked/unmasked at a high rate as well then sooner or - * later IOAPIC line gets 'stuck', no more interrupts are received - * from the device. If focus CPU is disabled then the hang goes - * away, oh well :-( - * - * [ This bug can be reproduced easily with a level-triggered - * PCI Ne2000 networking cards and PII/PIII processors, dual - * BX chipset. ] - */ - /* - * Actually disabling the focus CPU check just makes the hang less - * frequent as it makes the interrupt distributon model be more - * like LRU than MRU (the short-term load is more even across CPUs). - * See also the comment in end_level_ioapic_irq(). --macro - */ -#if 1 - /* Enable focus processor (bit==0) */ - value &= ~APIC_SPIV_FOCUS_DISABLED; -#else - /* Disable focus processor (bit==1) */ - value |= APIC_SPIV_FOCUS_DISABLED; -#endif + /* We always use processor focus */ + /* * Set spurious IRQ vector */ -- cgit v1.2.3 From 2e91a17b35116885373e04af142b1d08cf1b47bf Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Add some comments to entry.S And remove some old obsolete ones. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/entry.S | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index a0cf36ba7c5..a04fc4108ff 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -22,9 +22,21 @@ * at the top of the kernel process stack. * - partial stack frame: partially saved registers upto R11. * - full stack frame: Like partial stack frame, but all register saved. - * - * TODO: - * - schedule it carefully for the final hardware. + * + * Some macro usage: + * - CFI macros are used to generate dwarf2 unwind information for better + * backtraces. They don't change any code. + * - SAVE_ALL/RESTORE_ALL - Save/restore all registers + * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify. + * There are unfortunately lots of special cases where some registers + * not touched. The macro is a big mess that should be cleaned up. + * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS. + * Gives a full stack frame. + * - ENTRY/END Define functions in the symbol table. + * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack + * frame that is otherwise undefined after a SYSCALL + * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. + * - errorentry/paranoidentry/zeroentry - Define exception entry points. */ #include -- cgit v1.2.3 From 5cb6b99928a8f1a1bbdac7a16b88aef8cb64d432 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Remove pirq overwrite support This was an old workaround for broken MP-BIOS. The user could specify overwrites on the command line. I've never seen it being used for anything on 64bit. So get rid of it for now. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/io_apic.c | 55 -------------------------------------------- 1 file changed, 55 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 924a4a33295..86cc07caa0e 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -225,14 +225,6 @@ static void clear_IO_APIC (void) clear_IO_APIC_pin(apic, pin); } -/* - * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to - * specific CPU-side IRQs. - */ - -#define MAX_PIRQS 8 -static int pirq_entries [MAX_PIRQS]; -static int pirqs_enabled; int skip_ioapic_setup; int ioapic_force; @@ -370,34 +362,6 @@ void __init check_ioapic(void) } } -static int __init ioapic_pirq_setup(char *str) -{ - int i, max; - int ints[MAX_PIRQS+1]; - - get_options(str, ARRAY_SIZE(ints), ints); - - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - pirqs_enabled = 1; - apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n"); - max = MAX_PIRQS; - if (ints[0] < MAX_PIRQS) - max = ints[0]; - - for (i = 0; i < max; i++) { - apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); - /* - * PIRQs are mapped upside down, usually. - */ - pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; - } - return 1; -} - -__setup("pirq=", ioapic_pirq_setup); - /* * Find the IRQ entry number of a certain pin. */ @@ -793,22 +757,6 @@ static int pin_2_irq(int idx, int apic, int pin) } } BUG_ON(irq >= NR_IRQS); - - /* - * PCI IRQ command line redirection. Yes, limits are hardcoded. - */ - if ((pin >= 16) && (pin <= 23)) { - if (pirq_entries[pin-16] != -1) { - if (!pirq_entries[pin-16]) { - apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16); - } else { - irq = pirq_entries[pin-16]; - apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n", - pin-16, irq); - } - } - } - BUG_ON(irq >= NR_IRQS); return irq; } @@ -1281,9 +1229,6 @@ static void __init enable_IO_APIC(void) irq_2_pin[i].pin = -1; irq_2_pin[i].next = 0; } - if (!pirqs_enabled) - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; /* * The number of IO-APIC IRQ registers (== #pins): -- cgit v1.2.3 From e50991343488edf25d58820be1684322808c763c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Remove leftover MCE/EISA support No 64bit EISA or Microchannel systems ever. Remove the left over code in the IO-APIC driver and the mptable parser Signed-off-by: Andi Kleen --- arch/x86_64/kernel/io_apic.c | 57 ++------------------------------------------ arch/x86_64/kernel/mpparse.c | 16 +------------ 2 files changed, 3 insertions(+), 70 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 86cc07caa0e..97e90f6abcc 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -389,9 +389,7 @@ static int __init find_isa_irq_pin(int irq, int type) for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].mpc_srcbus; - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || - mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA) && + if (mp_bus_id_to_type[lbus] == MP_BUS_ISA && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) @@ -407,9 +405,7 @@ static int __init find_isa_irq_apic(int irq, int type) for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].mpc_srcbus; - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || - mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA) && + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) break; @@ -472,27 +468,6 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) return best_guess; } -/* - * EISA Edge/Level control register, ELCR - */ -static int EISA_ELCR(unsigned int irq) -{ - if (irq < 16) { - unsigned int port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; - } - apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq); - return 0; -} - -/* EISA interrupts are always polarity zero and can be edge or level - * trigger depending on the ELCR value. If an interrupt is listed as - * EISA conforming in the MP table, that means its trigger type must - * be read in from the ELCR */ - -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) -#define default_EISA_polarity(idx) (0) - /* ISA interrupts are always polarity zero edge triggered, * when listed as conforming in the MP table. */ @@ -505,12 +480,6 @@ static int EISA_ELCR(unsigned int irq) #define default_PCI_trigger(idx) (1) #define default_PCI_polarity(idx) (1) -/* MCA interrupts are always polarity zero level triggered, - * when listed as conforming in the MP table. */ - -#define default_MCA_trigger(idx) (1) -#define default_MCA_polarity(idx) (0) - static int __init MPBIOS_polarity(int idx) { int bus = mp_irqs[idx].mpc_srcbus; @@ -530,21 +499,11 @@ static int __init MPBIOS_polarity(int idx) polarity = default_ISA_polarity(idx); break; } - case MP_BUS_EISA: /* EISA pin */ - { - polarity = default_EISA_polarity(idx); - break; - } case MP_BUS_PCI: /* PCI pin */ { polarity = default_PCI_polarity(idx); break; } - case MP_BUS_MCA: /* MCA pin */ - { - polarity = default_MCA_polarity(idx); - break; - } default: { printk(KERN_WARNING "broken BIOS!!\n"); @@ -599,21 +558,11 @@ static int MPBIOS_trigger(int idx) trigger = default_ISA_trigger(idx); break; } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } case MP_BUS_PCI: /* PCI pin */ { trigger = default_PCI_trigger(idx); break; } - case MP_BUS_MCA: /* MCA pin */ - { - trigger = default_MCA_trigger(idx); - break; - } default: { printk(KERN_WARNING "broken BIOS!!\n"); @@ -731,8 +680,6 @@ static int pin_2_irq(int idx, int apic, int pin) switch (mp_bus_id_to_type[bus]) { case MP_BUS_ISA: /* ISA pin */ - case MP_BUS_EISA: - case MP_BUS_MCA: { irq = mp_irqs[idx].mpc_srcbusirq; break; diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index c73dd1f41e6..90e99cf27aa 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -175,14 +175,10 @@ static void __init MP_bus_info (struct mpc_config_bus *m) if (strncmp(str, "ISA", 3) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; - } else if (strncmp(str, "EISA", 4) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; } else if (strncmp(str, "PCI", 3) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; mp_current_pci_id++; - } else if (strncmp(str, "MCA", 3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; } else { printk(KERN_ERR "Unknown bustype %s\n", str); } @@ -465,14 +461,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) case 5: memcpy(bus.mpc_bustype, "ISA ", 6); break; - case 2: - case 6: - case 3: - memcpy(bus.mpc_bustype, "EISA ", 6); - break; - case 4: - case 7: - memcpy(bus.mpc_bustype, "MCA ", 6); } MP_bus_info(&bus); if (mpc_default_type > 4) { @@ -629,9 +617,7 @@ void __init find_intel_smp (void) smp_scan_config(0xF0000,0x10000)) return; /* - * If it is an SMP machine we should know now, unless the - * configuration is in an EISA/MCA bus machine with an - * extended bios data area. + * If it is an SMP machine we should know now. * * there is a real-mode segmented pointer pointing to the * 4K EBDA area at 0x40E, calculate and scan it here. -- cgit v1.2.3 From a8fcf1a24a16e1c735c795d99773b5dcefb71518 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Remove obsolete PIC mode PIC mode is an outdated way to drive the APICs that was used on some early MP boards. It is not supported in the ACPI model. It is unlikely to be ever configured by any x86-64 system Remove it thus. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/apic.c | 92 ++++++++++++++------------------------------ arch/x86_64/kernel/mpparse.c | 8 ---- arch/x86_64/kernel/smpboot.c | 1 - 3 files changed, 29 insertions(+), 72 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 78c43b1a306..20a76f20962 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -136,72 +136,40 @@ void clear_local_APIC(void) apic_read(APIC_ESR); } -void __init connect_bsp_APIC(void) -{ - if (pic_mode) { - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - /* - * PIC mode, enable APIC mode in the IMCR, i.e. - * connect BSP's local APIC to INT and NMI lines. - */ - apic_printk(APIC_VERBOSE, "leaving PIC mode, enabling APIC mode.\n"); - outb(0x70, 0x22); - outb(0x01, 0x23); - } -} - void disconnect_bsp_APIC(int virt_wire_setup) { - if (pic_mode) { - /* - * Put the board back into PIC mode (has an effect - * only on certain older boards). Note that APIC - * interrupts, including IPIs, won't work beyond - * this point! The only exception are INIT IPIs. - */ - apic_printk(APIC_QUIET, "disabling APIC mode, entering PIC mode.\n"); - outb(0x70, 0x22); - outb(0x00, 0x23); - } - else { - /* Go back to Virtual Wire compatibility mode */ - unsigned long value; - - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write(APIC_SPIV, value); - - if (!virt_wire_setup) { - /* For LVT0 make it edge triggered, active high, external and enabled */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write(APIC_LVT0, value); - } - else { - /* Disable LVT0 */ - apic_write(APIC_LVT0, APIC_LVT_MASKED); - } + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; - /* For LVT1 make it edge triggered, active high, nmi and enabled */ - value = apic_read(APIC_LVT1); - value &= ~( - APIC_MODE_MASK | APIC_SEND_PENDING | + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write(APIC_SPIV, value); + + if (!virt_wire_setup) { + /* For LVT0 make it edge triggered, active high, external and enabled */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write(APIC_LVT1, value); + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write(APIC_LVT0, value); + } else { + /* Disable LVT0 */ + apic_write(APIC_LVT0, APIC_LVT_MASKED); } + + /* For LVT1 make it edge triggered, active high, nmi and enabled */ + value = apic_read(APIC_LVT1); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write(APIC_LVT1, value); } void disable_local_APIC(void) @@ -418,7 +386,7 @@ void __cpuinit setup_local_APIC (void) * TODO: set up through-local-APIC from through-I/O-APIC? --macro */ value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; - if (!smp_processor_id() && (pic_mode || !value)) { + if (!smp_processor_id() && !value) { value = APIC_DM_EXTINT; apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id()); } else { @@ -1096,8 +1064,6 @@ int __init APIC_init_uniprocessor (void) verify_local_APIC(); - connect_bsp_APIC(); - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id)); diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 90e99cf27aa..02262450355 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -56,7 +56,6 @@ struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; int mp_irq_entries; int nr_ioapics; -int pic_mode; unsigned long mp_lapic_addr = 0; @@ -514,13 +513,6 @@ void __init get_smp_config (void) printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); - if (mpf->mpf_feature2 & (1<<7)) { - printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); - pic_mode = 1; - } else { - printk(KERN_INFO " Virtual Wire compatibility mode.\n"); - pic_mode = 0; - } /* * Now see if we need to read further. diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index d29571e2495..ae7d6d84e35 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -1090,7 +1090,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus) /* * Switch from PIC to APIC mode. */ - connect_bsp_APIC(); setup_local_APIC(); if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) { -- cgit v1.2.3 From c1a58b42b428e717afbbb298356e041cea54ad17 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] i386/x86-64: Remove obsolete sanity check in mptable parsing It apparently has never triggered in many years. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/mpparse.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 02262450355..c12acc3b555 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -223,19 +223,6 @@ static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); - /* - * Well it seems all SMP boards in existence - * use ExtINT/LVT1 == LINT0 and - * NMI/LVT2 == LINT1 - the following check - * will show us if this assumptions is false. - * Until then we do not have to add baggage. - */ - if ((m->mpc_irqtype == mp_ExtINT) && - (m->mpc_destapiclint != 0)) - BUG(); - if ((m->mpc_irqtype == mp_NMI) && - (m->mpc_destapiclint != 1)) - BUG(); } /* -- cgit v1.2.3 From eea0e11c1f0d6ef89e64182b2f1223a4ca2b74a2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Factor out common io apic routing entry access The IO APIC code had lots of duplicated code to read/write 64bit routing entries into the IO-APIC. Factor this out int common read/write functions In a few cases the IO APIC lock is taken more often now, but this isn't a problem because it's all initialization/shutdown only slow path code. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/io_apic.c | 82 ++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 41 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 97e90f6abcc..1ccf1b41c3e 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -111,6 +111,33 @@ int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; FINAL; \ } +union entry_union { + struct { u32 w1, w2; }; + struct IO_APIC_route_entry entry; +}; + +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + return eu.entry; +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + union entry_union eu; + eu.entry = e; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + #ifdef CONFIG_SMP static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { @@ -196,13 +223,9 @@ static void unmask_IO_APIC_irq (unsigned int irq) static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; - unsigned long flags; /* Check delivery_mode to be sure we're not clearing an SMI pin */ - spin_lock_irqsave(&ioapic_lock, flags); - *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, pin); if (entry.delivery_mode == dest_SMI) return; /* @@ -210,10 +233,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) */ memset(&entry, 0, sizeof(entry)); entry.mask = 1; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry); } static void clear_IO_APIC (void) @@ -838,9 +858,9 @@ static void __init setup_IO_APIC_irqs(void) if (!apic && (irq < 16)) disable_8259A_irq(irq); } + ioapic_write_entry(apic, pin, entry); + spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -978,10 +998,7 @@ void __apicdebuginit print_IO_APIC(void) for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, i); printk(KERN_DEBUG " %02x %03X %02X ", i, @@ -1191,11 +1208,7 @@ static void __init enable_IO_APIC(void) /* See if any of the pins is in ExtINT mode */ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { struct IO_APIC_route_entry entry; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); - + entry = ioapic_read_entry(apic, pin); /* If the interrupt line is enabled and in ExtInt mode * I have found the pin where the i8259 is connected. @@ -1247,7 +1260,6 @@ void disable_IO_APIC(void) */ if (ioapic_i8259.pin != -1) { struct IO_APIC_route_entry entry; - unsigned long flags; memset(&entry, 0, sizeof(entry)); entry.mask = 0; /* Enabled */ @@ -1264,12 +1276,7 @@ void disable_IO_APIC(void) /* * Add it to the IO-APIC irq-routing table: */ - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin, - *(((int *)&entry)+1)); - io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin, - *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } disconnect_bsp_APIC(ioapic_i8259.pin != -1); @@ -1879,17 +1886,12 @@ static int ioapic_suspend(struct sys_device *dev, pm_message_t state) { struct IO_APIC_route_entry *entry; struct sysfs_ioapic_data *data; - unsigned long flags; int i; data = container_of(dev, struct sysfs_ioapic_data, dev); entry = data->entry; - spin_lock_irqsave(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); - *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); - } - spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) + *entry = ioapic_read_entry(dev->id, i); return 0; } @@ -1911,11 +1913,9 @@ static int ioapic_resume(struct sys_device *dev) reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; io_apic_write(dev->id, 0, reg_00.raw); } - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); - io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); - } spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i++) + ioapic_write_entry(dev->id, i, entry[i]); return 0; } @@ -2040,10 +2040,10 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p if (!ioapic && (irq < 16)) disable_8259A_irq(irq); + ioapic_write_entry(ioapic, pin, entry); + spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); - set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS); + set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); return 0; -- cgit v1.2.3 From edd965229669f8f8dfddec8c863250440fb65ab3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Remove MPS table APIC renumbering The MPS table specification says that the operating system should renumber the IO-APICs following the table as needed. However in ACPI this is not allowed or neeeded and all x86-64 systems are ACPI compliant. The code was already disabled on some systems because it caused problems there. Remove it completely now. CC: mdomsch@dell.com Signed-off-by: Andi Kleen --- arch/x86_64/kernel/io_apic.c | 71 -------------------------------------------- 1 file changed, 71 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 1ccf1b41c3e..a02ed17d05d 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -1282,72 +1282,6 @@ void disable_IO_APIC(void) disconnect_bsp_APIC(ioapic_i8259.pin != -1); } -/* - * function to set the IO-APIC physical IDs based on the - * values stored in the MPC table. - * - * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 - */ - -static void __init setup_ioapic_ids_from_mpc (void) -{ - union IO_APIC_reg_00 reg_00; - int apic; - int i; - unsigned char old_id; - unsigned long flags; - - /* - * Set the IOAPIC ID to the value stored in the MPC table. - */ - for (apic = 0; apic < nr_ioapics; apic++) { - - /* Read the register 0 value */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - old_id = mp_ioapics[apic].mpc_apicid; - - - printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid); - - - /* - * We need to adjust the IRQ routing table - * if the ID changed. - */ - if (old_id != mp_ioapics[apic].mpc_apicid) - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mpc_dstapic == old_id) - mp_irqs[i].mpc_dstapic - = mp_ioapics[apic].mpc_apicid; - - /* - * Read the right value from the MPC table and - * write it into the ID register. - */ - apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", - mp_ioapics[apic].mpc_apicid); - - reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0, reg_00.raw); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* - * Sanity check - */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) - printk("could not set ID!\n"); - else - apic_printk(APIC_VERBOSE," ok.\n"); - } -} - /* * There is a nasty bug in some older SMP boards, their mptable lies * about the timer IRQ. We do the following to work around the situation: @@ -1863,11 +1797,6 @@ void __init setup_IO_APIC(void) apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); - /* - * Set up the IO-APIC IRQ routing table. - */ - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); -- cgit v1.2.3 From dfa4698c50bf85b7927214b0e4a3dc4bc3b3c4a9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Move early chipset quirks out to new file They did not really belong into io_apic.c. Move them into a new file and clean it up a bit. Also remove outdated ATI quirk that was obsolete, Signed-off-by: Andi Kleen --- arch/x86_64/kernel/Makefile | 2 +- arch/x86_64/kernel/early-quirks.c | 118 ++++++++++++++++++++++++++++++++++++++ arch/x86_64/kernel/io_apic.c | 101 -------------------------------- arch/x86_64/kernel/setup.c | 2 +- 4 files changed, 120 insertions(+), 103 deletions(-) create mode 100644 arch/x86_64/kernel/early-quirks.c (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 0ef39553c0f..000e67e8f02 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -8,7 +8,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - pci-dma.o pci-nommu.o alternative.o + pci-dma.o pci-nommu.o alternative.o early-quirks.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c new file mode 100644 index 00000000000..d637cff1c4b --- /dev/null +++ b/arch/x86_64/kernel/early-quirks.c @@ -0,0 +1,118 @@ +/* Various workarounds for chipset bugs. + This code runs very early and can't use the regular PCI subsystem + The entries are keyed to PCI bridges which usually identify chipsets + uniquely. + This is only for whole classes of chipsets with specific problems which + need early invasive action (e.g. before the timers are initialized). + Most PCI device specific workarounds can be done later and should be + in standard PCI quirks + Mainboard specific bugs should be handled by DMI entries. + CPU specific bugs in setup.c */ + +#include +#include +#include +#include +#include +#include + +static void via_bugs(void) +{ +#ifdef CONFIG_IOMMU + if ((end_pfn > MAX_DMA32_PFN || force_iommu) && + !iommu_aperture_allowed) { + printk(KERN_INFO + "Looks like a VIA chipset. Disabling IOMMU. Override with iommu=allowed\n"); + iommu_aperture_disabled = 1; + } +#endif +} + +#ifdef CONFIG_ACPI + +static int nvidia_hpet_detected __initdata; + +static int __init nvidia_hpet_check(unsigned long phys, unsigned long size) +{ + nvidia_hpet_detected = 1; + return 0; +} +#endif + +static void nvidia_bugs(void) +{ +#ifdef CONFIG_ACPI + /* + * All timer overrides on Nvidia are + * wrong unless HPET is enabled. + */ + nvidia_hpet_detected = 0; + acpi_table_parse(ACPI_HPET, nvidia_hpet_check); + if (nvidia_hpet_detected == 0) { + acpi_skip_timer_override = 1; + printk(KERN_INFO "Nvidia board " + "detected. Ignoring ACPI " + "timer override.\n"); + } +#endif + /* RED-PEN skip them on mptables too? */ + +} + +static void ati_bugs(void) +{ +#if 1 /* for testing */ + printk("ATI board detected\n"); +#endif + /* No bugs right now */ +} + +struct chipset { + u16 vendor; + void (*f)(void); +}; + +static struct chipset early_qrk[] = { + { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, + { PCI_VENDOR_ID_VIA, via_bugs }, + { PCI_VENDOR_ID_ATI, ati_bugs }, + {} +}; + +void __init early_quirks(void) +{ + int num, slot, func; + /* Poor man's PCI discovery */ + for (num = 0; num < 32; num++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + u32 class; + u32 vendor; + u8 type; + int i; + class = read_pci_config(num,slot,func, + PCI_CLASS_REVISION); + if (class == 0xffffffff) + break; + + if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) + continue; + + vendor = read_pci_config(num, slot, func, + PCI_VENDOR_ID); + vendor &= 0xffff; + + for (i = 0; early_qrk[i].f; i++) + if (early_qrk[i].vendor == vendor) { + early_qrk[i].f(); + return; + } + + type = read_pci_config_byte(num, slot, func, + PCI_HEADER_TYPE); + if (!(type & 0x80)) + break; + } + } + } +} diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index a02ed17d05d..8bbff64c20d 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -280,107 +280,6 @@ static int __init setup_enable_8254_timer(char *s) __setup("disable_8254_timer", setup_disable_8254_timer); __setup("enable_8254_timer", setup_enable_8254_timer); -#include -#include -#include - - -#ifdef CONFIG_ACPI - -static int nvidia_hpet_detected __initdata; - -static int __init nvidia_hpet_check(unsigned long phys, unsigned long size) -{ - nvidia_hpet_detected = 1; - return 0; -} -#endif - -/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC - off. Check for an Nvidia or VIA PCI bridge and turn it off. - Use pci direct infrastructure because this runs before the PCI subsystem. - - Can be overwritten with "apic" - - And another hack to disable the IOMMU on VIA chipsets. - - ... and others. Really should move this somewhere else. - - Kludge-O-Rama. */ -void __init check_ioapic(void) -{ - int num,slot,func; - /* Poor man's PCI discovery */ - for (num = 0; num < 32; num++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - u32 class; - u32 vendor; - u8 type; - class = read_pci_config(num,slot,func, - PCI_CLASS_REVISION); - if (class == 0xffffffff) - break; - - if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) - continue; - - vendor = read_pci_config(num, slot, func, - PCI_VENDOR_ID); - vendor &= 0xffff; - switch (vendor) { - case PCI_VENDOR_ID_VIA: -#ifdef CONFIG_IOMMU - if ((end_pfn > MAX_DMA32_PFN || - force_iommu) && - !iommu_aperture_allowed) { - printk(KERN_INFO - "Looks like a VIA chipset. Disabling IOMMU. Override with \"iommu=allowed\"\n"); - iommu_aperture_disabled = 1; - } -#endif - return; - case PCI_VENDOR_ID_NVIDIA: -#ifdef CONFIG_ACPI - /* - * All timer overrides on Nvidia are - * wrong unless HPET is enabled. - */ - nvidia_hpet_detected = 0; - acpi_table_parse(ACPI_HPET, - nvidia_hpet_check); - if (nvidia_hpet_detected == 0) { - acpi_skip_timer_override = 1; - printk(KERN_INFO "Nvidia board " - "detected. Ignoring ACPI " - "timer override.\n"); - } -#endif - /* RED-PEN skip them on mptables too? */ - return; - - /* This should be actually default, but - for 2.6.16 let's do it for ATI only where - it's really needed. */ - case PCI_VENDOR_ID_ATI: - if (timer_over_8254 == 1) { - timer_over_8254 = 0; - printk(KERN_INFO - "ATI board detected. Disabling timer routing over 8254.\n"); - } - return; - } - - - /* No multi-function device? */ - type = read_pci_config_byte(num,slot,func, - PCI_HEADER_TYPE); - if (!(type & 0x80)) - break; - } - } - } -} /* * Find the IRQ entry number of a certain pin. diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index fbe9f7faa2d..aab4bd66aa0 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -653,7 +653,7 @@ void __init setup_arch(char **cmdline_p) paging_init(); - check_ioapic(); + early_quirks(); /* * set this early, so we dont allocate cpu0 -- cgit v1.2.3 From 55f05ffaa788e039df2f1ebe0d7bfbcb6f39d0b4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Replace mp bus array with bitmap for bus not pci Since we only support PCI and ISA legacy busses now there is no need to have an full array with checking. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/io_apic.c | 92 +++++++++++--------------------------------- arch/x86_64/kernel/mpparse.c | 9 ++--- 2 files changed, 26 insertions(+), 75 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 8bbff64c20d..a1412042b91 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -308,7 +308,7 @@ static int __init find_isa_irq_pin(int irq, int type) for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].mpc_srcbus; - if (mp_bus_id_to_type[lbus] == MP_BUS_ISA && + if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) @@ -324,7 +324,7 @@ static int __init find_isa_irq_apic(int irq, int type) for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].mpc_srcbus; - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA) && + if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) break; @@ -364,7 +364,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) mp_irqs[i].mpc_dstapic == MP_APIC_ALL) break; - if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && + if (!test_bit(lbus, mp_bus_not_pci) && !mp_irqs[i].mpc_irqtype && (bus == lbus) && (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { @@ -410,28 +410,11 @@ static int __init MPBIOS_polarity(int idx) switch (mp_irqs[idx].mpc_irqflag & 3) { case 0: /* conforms, ie. bus-type dependent polarity */ - { - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - { - polarity = default_ISA_polarity(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - polarity = default_PCI_polarity(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - } + if (test_bit(bus, mp_bus_not_pci)) + polarity = default_ISA_polarity(idx); + else + polarity = default_PCI_polarity(idx); break; - } case 1: /* high active */ { polarity = 0; @@ -469,28 +452,11 @@ static int MPBIOS_trigger(int idx) switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) { case 0: /* conforms, ie. bus-type dependent */ - { - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - { - trigger = default_ISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - trigger = default_PCI_trigger(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - } + if (test_bit(bus, mp_bus_not_pci)) + trigger = default_ISA_trigger(idx); + else + trigger = default_PCI_trigger(idx); break; - } case 1: /* edge */ { trigger = 0; @@ -596,31 +562,17 @@ static int pin_2_irq(int idx, int apic, int pin) if (mp_irqs[idx].mpc_dstirq != pin) printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - { - irq = mp_irqs[idx].mpc_srcbusirq; - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* - * PCI IRQs are mapped in order - */ - i = irq = 0; - while (i < apic) - irq += nr_ioapic_registers[i++]; - irq += pin; - irq = gsi_irq_sharing(irq); - break; - } - default: - { - printk(KERN_ERR "unknown bus type %d.\n",bus); - irq = 0; - break; - } + if (test_bit(bus, mp_bus_not_pci)) { + irq = mp_irqs[idx].mpc_srcbusirq; + } else { + /* + * PCI IRQs are mapped in order + */ + i = irq = 0; + while (i < apic) + irq += nr_ioapic_registers[i++]; + irq += pin; + irq = gsi_irq_sharing(irq); } BUG_ON(irq >= NR_IRQS); return irq; diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index c12acc3b555..a8d38c0bb44 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -42,7 +42,7 @@ int acpi_found_madt; * MP-table. */ unsigned char apic_version [MAX_APICS]; -unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; static int mp_current_pci_id = 0; @@ -173,9 +173,9 @@ static void __init MP_bus_info (struct mpc_config_bus *m) Dprintk("Bus #%d is %s\n", m->mpc_busid, str); if (strncmp(str, "ISA", 3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; + set_bit(m->mpc_busid, mp_bus_not_pci); } else if (strncmp(str, "PCI", 3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; + clear_bit(m->mpc_busid, mp_bus_not_pci); mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; mp_current_pci_id++; } else { @@ -808,8 +808,7 @@ void __init mp_config_acpi_legacy_irqs (void) /* * Fabricate the legacy ISA bus (bus #31). */ - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); + set_bit(MP_ISA_BUS, mp_bus_not_pci); /* * Locate the IOAPIC that manages the ISA IRQs (0-15). -- cgit v1.2.3 From a01fd3baff5e2e744f9d1af0e5d3e6b6082e4dcf Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Remove useless wrapper in mpparse.c code It used to contain support code for NUMAQ, but that is long gone already on 64bit. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/mpparse.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index a8d38c0bb44..79423d1bfca 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -579,7 +579,7 @@ static int __init smp_scan_config (unsigned long base, unsigned long length) return 0; } -void __init find_intel_smp (void) +void __init find_smp_config(void) { unsigned int address; @@ -617,15 +617,6 @@ void __init find_intel_smp (void) printk(KERN_INFO "No mptable found.\n"); } -/* - * - Intel MP Configuration Table - */ -void __init find_smp_config (void) -{ - find_intel_smp(); -} - - /* -------------------------------------------------------------------------- ACPI-based MP Configuration -------------------------------------------------------------------------- */ -- cgit v1.2.3 From 276ec1a76ad204d47947894a914e10e798400ffb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Remove some unneeded ACPI externs in mpparse.c They are not used in this file so remove them. i386 didn't have them either. Cc: len.brown@intel.com Signed-off-by: Andi Kleen --- arch/x86_64/kernel/mpparse.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 79423d1bfca..0cc88f7f288 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -70,15 +70,6 @@ unsigned disabled_cpus __initdata; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; -/* ACPI MADT entry parsing functions */ -#ifdef CONFIG_ACPI -extern struct acpi_boot_flags acpi_boot; -extern int acpi_parse_lapic (acpi_table_entry_header *header); -extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header); -extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header); -extern int acpi_parse_ioapic (acpi_table_entry_header *header); -#endif /*CONFIG_ACPI*/ - u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -- cgit v1.2.3 From efec3b9a3282714c8441b9bf476f8358bed9ecaa Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Fix up some non linuxy style in ACPI functions in mpparse.c No functional changes. Cc: len.brown@intel.com Signed-off-by: Andi Kleen --- arch/x86_64/kernel/mpparse.c | 57 +++++++++++++------------------------------- 1 file changed, 16 insertions(+), 41 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 0cc88f7f288..d63f849aea2 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -614,23 +614,17 @@ void __init find_smp_config(void) #ifdef CONFIG_ACPI -void __init mp_register_lapic_address ( - u64 address) +void __init mp_register_lapic_address(u64 address) { mp_lapic_addr = (unsigned long) address; - set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); - if (boot_cpu_id == -1U) boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); } - -void __cpuinit mp_register_lapic ( - u8 id, - u8 enabled) +void __cpuinit mp_register_lapic (u8 id, u8 enabled) { struct mpc_config_processor processor; int boot_cpu = 0; @@ -668,11 +662,9 @@ static struct mp_ioapic_routing { u32 pin_programmed[4]; } mp_ioapic_routing[MAX_IO_APICS]; - -static int mp_find_ioapic ( - int gsi) +static int mp_find_ioapic(int gsi) { - int i = 0; + int i = 0; /* Find the IOAPIC that manages this GSI. */ for (i = 0; i < nr_ioapics; i++) { @@ -682,17 +674,12 @@ static int mp_find_ioapic ( } printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); - return -1; } - -void __init mp_register_ioapic ( - u8 id, - u32 address, - u32 gsi_base) +void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) { - int idx = 0; + int idx = 0; if (nr_ioapics >= MAX_IO_APICS) { printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " @@ -729,16 +716,10 @@ void __init mp_register_ioapic ( mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, mp_ioapic_routing[idx].gsi_start, mp_ioapic_routing[idx].gsi_end); - - return; } - -void __init mp_override_legacy_irq ( - u8 bus_irq, - u8 polarity, - u8 trigger, - u32 gsi) +void __init +mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) { struct mpc_config_intsrc intsrc; int ioapic = -1; @@ -776,16 +757,13 @@ void __init mp_override_legacy_irq ( mp_irqs[mp_irq_entries] = intsrc; if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!\n"); - - return; } - -void __init mp_config_acpi_legacy_irqs (void) +void __init mp_config_acpi_legacy_irqs(void) { struct mpc_config_intsrc intsrc; - int i = 0; - int ioapic = -1; + int i = 0; + int ioapic = -1; /* * Fabricate the legacy ISA bus (bus #31). @@ -843,24 +821,22 @@ void __init mp_config_acpi_legacy_irqs (void) if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!\n"); } - - return; } #define MAX_GSI_NUM 4096 int mp_register_gsi(u32 gsi, int triggering, int polarity) { - int ioapic = -1; - int ioapic_pin = 0; - int idx, bit = 0; - static int pci_irq = 16; + int ioapic = -1; + int ioapic_pin = 0; + int idx, bit = 0; + static int pci_irq = 16; /* * Mapping between Global System Interrupts, which * represent all possible interrupts, to the IRQs * assigned to actual devices. */ - static int gsi_to_irq[MAX_GSI_NUM]; + static int gsi_to_irq[MAX_GSI_NUM]; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -934,5 +910,4 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) polarity == ACPI_ACTIVE_HIGH ? 0 : 1); return gsi; } - #endif /*CONFIG_ACPI*/ -- cgit v1.2.3 From fe7414a2882c953788af13a7f2c9d570ed8f71c5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Use BUILD_BUG_ON in apic.c build sanity checking Makes code a little shorter. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/apic.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 20a76f20962..496cd1bd2ae 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -265,8 +265,6 @@ void __init sync_Arb_IDs(void) | APIC_DM_INIT); } -extern void __error_in_apic_c (void); - /* * An initial setup of the virtual wire mode. */ @@ -313,8 +311,7 @@ void __cpuinit setup_local_APIC (void) value = apic_read(APIC_LVR); - if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f) - __error_in_apic_c(); + BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f); /* * Double-check whether this APIC is really registered. -- cgit v1.2.3 From 44cc45267bbe7c64f7d85b074bd670b48b5abdfb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Remove obsolete CVS $Id$ from assembler files in arch/x86_64/kernel/* CVS hasn't been used for a long time for them. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/entry.S | 2 -- arch/x86_64/kernel/head.S | 2 -- 2 files changed, 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index a04fc4108ff..2092f565aa8 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -4,8 +4,6 @@ * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs * Copyright (C) 2000 Pavel Machek - * - * $Id$ */ /* diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 505ec4a5750..4a3326ce709 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -5,8 +5,6 @@ * Copyright (C) 2000 Pavel Machek * Copyright (C) 2000 Karsten Keil * Copyright (C) 2001,2002 Andi Kleen - * - * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $ */ -- cgit v1.2.3 From caff0710ebf6f2c44cbd2b8b31fd6329148bed2e Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] initialize end of memory variables as early as possible While an earlier patch already did a small step into that direction, this patch moves initialization of all memory end variables to as early as possible, so that dependent code doesn't need to check whether these variables have already been set. Also, remove a misleading (perhaps just outdated) comment, and make static a variable only used in a single file. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/x86_64/kernel/e820.c | 2 +- arch/x86_64/kernel/setup.c | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index d6d7f731f6f..1362aad4f3a 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -40,7 +40,7 @@ unsigned long end_pfn_map; /* * Last pfn which the user wants to use. */ -unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT; +static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; extern struct resource code_resource, data_resource; diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index aab4bd66aa0..ddc3b2d9a16 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -554,7 +554,7 @@ void __init setup_arch(char **cmdline_p) * we are rounding upwards: */ end_pfn = e820_end_of_ram(); - num_physpages = end_pfn; /* for pfn_valid */ + num_physpages = end_pfn; check_efer(); @@ -574,6 +574,11 @@ void __init setup_arch(char **cmdline_p) acpi_boot_table_init(); #endif + /* How many end-of-memory variables you have, grandma! */ + max_low_pfn = end_pfn; + max_pfn = end_pfn; + high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1; + #ifdef CONFIG_ACPI_NUMA /* * Parse SRAT to discover nodes. -- cgit v1.2.3 From 3b94355c47e2b025a7ececa0a14180e8fce6b0f1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] remove int_delivery_dest The genapic field and the accessor macro weren't used anywhere. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/x86_64/kernel/genapic_cluster.c | 1 - arch/x86_64/kernel/genapic_flat.c | 2 -- 2 files changed, 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c index 3020917546d..cdb90e671b8 100644 --- a/arch/x86_64/kernel/genapic_cluster.c +++ b/arch/x86_64/kernel/genapic_cluster.c @@ -118,7 +118,6 @@ struct genapic apic_cluster = { .name = "clustered", .int_delivery_mode = dest_Fixed, .int_dest_mode = (APIC_DEST_PHYSICAL != 0), - .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED, .target_cpus = cluster_target_cpus, .apic_id_registered = cluster_apic_id_registered, .init_apic_ldr = cluster_init_apic_ldr, diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index eb86d374813..2ceb8b872b5 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -121,7 +121,6 @@ struct genapic apic_flat = { .name = "flat", .int_delivery_mode = dest_LowestPrio, .int_dest_mode = (APIC_DEST_LOGICAL != 0), - .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST, .target_cpus = flat_target_cpus, .apic_id_registered = flat_apic_id_registered, .init_apic_ldr = flat_init_apic_ldr, @@ -180,7 +179,6 @@ struct genapic apic_physflat = { .name = "physical flat", .int_delivery_mode = dest_Fixed, .int_dest_mode = (APIC_DEST_PHYSICAL != 0), - .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED, .target_cpus = physflat_target_cpus, .apic_id_registered = flat_apic_id_registered, .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/ -- cgit v1.2.3 From f38db651d5da5e10235fd7dd31095969fb7ef6fb Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Calgary IOMMU: consolidate per bus data structures Move the tce_table_kva array, disabled bitmap and bus_to_phb array into a new per bus 'struct calgary_bus_info'. Also slightly reorganize build_tce_table and tce_table_setparms to avoid exporting bus_info to tce.c. Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 53 +++++++++++++++++++++------------------- arch/x86_64/kernel/tce.c | 10 -------- 2 files changed, 28 insertions(+), 35 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 146924ba5df..96f6a866afa 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -111,17 +111,17 @@ static const unsigned long phb_offsets[] = { 0xB000 /* PHB3 */ }; -static char bus_to_phb[MAX_PHB_BUS_NUM]; -void* tce_table_kva[MAX_PHB_BUS_NUM]; unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; static int translate_empty_slots __read_mostly = 0; static int calgary_detected __read_mostly = 0; -/* - * the bitmap of PHBs the user requested that we disable - * translation on. - */ -static DECLARE_BITMAP(translation_disabled, MAX_PHB_BUS_NUM); +struct calgary_bus_info { + void *tce_space; + int translation_disabled; + signed char phbid; +}; + +static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; static void tce_cache_blast(struct iommu_table *tbl); @@ -149,7 +149,7 @@ static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) static inline int translate_phb(struct pci_dev* dev) { - int disabled = test_bit(dev->bus->number, translation_disabled); + int disabled = bus_info[dev->bus->number].translation_disabled; return !disabled; } @@ -454,7 +454,7 @@ static struct dma_mapping_ops calgary_dma_ops = { static inline int busno_to_phbid(unsigned char num) { - return bus_to_phb[num]; + return bus_info[num].phbid; } static inline unsigned long split_queue_offset(unsigned char num) @@ -631,6 +631,10 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) if (ret) return ret; + tbl = dev->sysdata; + tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; + tce_free(tbl, 0, tbl->it_size); + calgary_reserve_regions(dev); /* set TARs for each PHB */ @@ -824,7 +828,7 @@ static int __init calgary_init(void) calgary_init_one_nontraslated(dev); continue; } - if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) { + if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) { pci_dev_put(dev); continue; } @@ -844,7 +848,7 @@ error: pci_dev_put(dev); continue; } - if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) + if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) continue; calgary_disable_translation(dev); calgary_free_tar(dev); @@ -894,9 +898,8 @@ void __init detect_calgary(void) for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { int dev; - - tce_table_kva[bus] = NULL; - bus_to_phb[bus] = -1; + struct calgary_bus_info *info = &bus_info[bus]; + info->phbid = -1; if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) continue; @@ -907,12 +910,9 @@ void __init detect_calgary(void) */ phb = (phb + 1) % PHBS_PER_CALGARY; - if (test_bit(bus, translation_disabled)) { - printk(KERN_INFO "Calgary: translation is disabled for " - "PHB 0x%x\n", bus); - /* skip this phb, don't allocate a tbl for it */ + if (info->translation_disabled) continue; - } + /* * Scan the slots of the PCI bus to see if there is a device present. * The parent bus will be the zero-ith device, so start at 1. @@ -923,8 +923,8 @@ void __init detect_calgary(void) tbl = alloc_tce_table(); if (!tbl) goto cleanup; - tce_table_kva[bus] = tbl; - bus_to_phb[bus] = phb; + info->tce_space = tbl; + info->phbid = phb; calgary_found = 1; break; } @@ -940,9 +940,12 @@ void __init detect_calgary(void) return; cleanup: - for (--bus; bus >= 0; --bus) - if (tce_table_kva[bus]) - free_tce_table(tce_table_kva[bus]); + for (--bus; bus >= 0; --bus) { + struct calgary_bus_info *info = &bus_info[bus]; + + if (info->tce_space) + free_tce_table(info->tce_space); + } } int __init calgary_iommu_init(void) @@ -1016,7 +1019,7 @@ static int __init calgary_parse_options(char *p) if (bridge < MAX_PHB_BUS_NUM) { printk(KERN_INFO "Calgary: disabling " "translation for PHB 0x%x\n", bridge); - set_bit(bridge, translation_disabled); + bus_info[bridge].translation_disabled = 1; } } diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c index 0905ce6b9b3..cbabfdf78e0 100644 --- a/arch/x86_64/kernel/tce.c +++ b/arch/x86_64/kernel/tce.c @@ -106,14 +106,6 @@ static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) /* set the tce table size - measured in entries */ tbl->it_size = table_size_to_number_of_entries(specified_table_size); - tbl->it_base = (unsigned long)tce_table_kva[dev->bus->number]; - if (!tbl->it_base) { - printk(KERN_ERR "Calgary: iommu_table_setparms: " - "no table allocated?!\n"); - ret = -ENOMEM; - goto done; - } - /* * number of bytes needed for the bitmap size in number of * entries; we need one bit per entry @@ -162,8 +154,6 @@ int build_tce_table(struct pci_dev *dev, void __iomem *bbar) if (ret) goto free_tbl; - tce_free(tbl, 0, tbl->it_size); - tbl->bbar = bbar; /* -- cgit v1.2.3 From 9f2dc46d5ec6fd7787182d2232a1003af11879f1 Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Calgary IOMMU: break out of pci_find_device_reverse if dev not found Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 96f6a866afa..b2182c93630 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -844,6 +844,8 @@ error: dev = pci_find_device_reverse(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_CALGARY, dev); + if (!dev) + break; if (!translate_phb(dev)) { pci_dev_put(dev); continue; -- cgit v1.2.3 From b8f4fe66a560b5ccbb4d4d0d2df356a5d9b98b83 Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Calgary IOMMU: fix error path memleak in calgary_free_tar We were freeing the iommu_table and leaking the bitmap pages. Also rename it to calgary_free_bus, which is more accurate. Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index b2182c93630..6c5da1d813f 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -658,11 +658,12 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) return 0; } -static void __init calgary_free_tar(struct pci_dev *dev) +static void __init calgary_free_bus(struct pci_dev *dev) { u64 val64; struct iommu_table *tbl = dev->sysdata; void __iomem *target; + unsigned int bitmapsz; target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number)); val64 = be64_to_cpu(readq(target)); @@ -670,8 +671,15 @@ static void __init calgary_free_tar(struct pci_dev *dev) writeq(cpu_to_be64(val64), target); readq(target); /* flush */ + bitmapsz = tbl->it_size / BITS_PER_BYTE; + free_pages((unsigned long)tbl->it_map, get_order(bitmapsz)); + tbl->it_map = NULL; + kfree(tbl); dev->sysdata = NULL; + + /* Can't free bootmem allocated memory after system is up :-( */ + bus_info[dev->bus->number].tce_space = NULL; } static void calgary_watchdog(unsigned long data) @@ -853,7 +861,7 @@ error: if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) continue; calgary_disable_translation(dev); - calgary_free_tar(dev); + calgary_free_bus(dev); pci_dev_put(dev); } -- cgit v1.2.3 From 871b17008e93d7e96f96829ce5f0393c9902d25b Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Calgary IOMMU: fix reference counting of Calgary PCI devices The pci_get_device() API decrements the reference count on the 'from' parameter when it continues searching. Therefore, take a ref count on Calgary bus when we initialize them in either translated or non-translated mode. Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 6c5da1d813f..7302834718c 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -786,6 +786,7 @@ static inline unsigned int __init locate_register_space(struct pci_dev *dev) static int __init calgary_init_one_nontraslated(struct pci_dev *dev) { + pci_dev_get(dev); dev->sysdata = NULL; dev->bus->self = dev; @@ -810,6 +811,7 @@ static int __init calgary_init_one(struct pci_dev *dev) if (ret) goto iounmap; + pci_dev_get(dev); dev->bus->self = dev; calgary_enable_translation(dev); @@ -836,10 +838,9 @@ static int __init calgary_init(void) calgary_init_one_nontraslated(dev); continue; } - if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) { - pci_dev_put(dev); + if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) continue; - } + ret = calgary_init_one(dev); if (ret) goto error; @@ -860,9 +861,10 @@ error: } if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) continue; + calgary_disable_translation(dev); calgary_free_bus(dev); - pci_dev_put(dev); + pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ } return ret; -- cgit v1.2.3 From a4fc520a0ff92810eea46d74bf742ef73849302e Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Calgary IOMMU: calgary_init_one_nontraslated() can return void Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 7302834718c..f541fb564d9 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -784,13 +784,11 @@ static inline unsigned int __init locate_register_space(struct pci_dev *dev) return address; } -static int __init calgary_init_one_nontraslated(struct pci_dev *dev) +static void __init calgary_init_one_nontraslated(struct pci_dev *dev) { pci_dev_get(dev); dev->sysdata = NULL; dev->bus->self = dev; - - return 0; } static int __init calgary_init_one(struct pci_dev *dev) -- cgit v1.2.3 From 0577f148b5e9a773020e3da1e6332a7c6df9d601 Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Calgary IOMMU: save a bit of space in bus_info Make translation_disabled a uchar rather than an int Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index f541fb564d9..caf84e4e4ca 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -117,7 +117,7 @@ static int calgary_detected __read_mostly = 0; struct calgary_bus_info { void *tce_space; - int translation_disabled; + unsigned char translation_disabled; signed char phbid; }; -- cgit v1.2.3 From a752d7194c4fb5a3e767c95542d04fc5decb1d52 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] fix is_at_popf() for compat tasks When testing for the REX instruction prefix, first check for 32-bit mode because in compat mode the REX prefix is an increment instruction. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen --- arch/x86_64/kernel/ptrace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index d35ec1bc696..3a52c7bcfd1 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -141,8 +141,11 @@ static int is_at_popf(struct task_struct *child, struct pt_regs *regs) case 0xf0: case 0xf2: case 0xf3: continue; - /* REX prefixes */ case 0x40 ... 0x4f: + if (regs->cs != __USER_CS) + /* 32-bit mode: register increment */ + return 0; + /* 64-bit mode: REX prefix */ continue; /* CHECKME: f0, f2, f3 */ -- cgit v1.2.3 From 29a6c25bd63dbd52e73d25268402d065ad84c0fe Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] Fix gdt table size in trampoline.S Allows easier extension of the GDT by using the proper C symbol for the size in the descriptor. Signed-off-by: Eric W. Biederman Signed-off-by: Andi Kleen --- arch/x86_64/kernel/trampoline.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S index 23a03eb91fc..c79b99a9e2f 100644 --- a/arch/x86_64/kernel/trampoline.S +++ b/arch/x86_64/kernel/trampoline.S @@ -64,7 +64,7 @@ idt_48: .word 0, 0 # idt base = 0L gdt_48: - .short __KERNEL32_CS + 7 # gdt limit + .short GDT_ENTRIES*8 - 1 # gdt limit .long cpu_gdt_table-__START_KERNEL_map .globl trampoline_end -- cgit v1.2.3 From 145106e81090fcdfb57b7f2e90b98c6445b6f33d Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] remove superflous BUG_ON's in nommu and gart There's no need to check for invalid DMA data direction in nommu and gart since we do it in dma-mapping.h anyway before calling the individual dma-ops. Signed-off-by: Muli Ben-Yehuda Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-gart.c | 3 --- arch/x86_64/kernel/pci-nommu.c | 1 - 2 files changed, 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index 6d3e61baf7a..16261a8a330 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -239,8 +239,6 @@ dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) { unsigned long phys_mem, bus; - BUG_ON(dir == DMA_NONE); - if (!dev) dev = &fallback_dev; @@ -383,7 +381,6 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) unsigned long pages = 0; int need = 0, nextneed; - BUG_ON(dir == DMA_NONE); if (nents == 0) return 0; diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c index aad7609d8e9..df09ab05a1b 100644 --- a/arch/x86_64/kernel/pci-nommu.c +++ b/arch/x86_64/kernel/pci-nommu.c @@ -59,7 +59,6 @@ int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, { int i; - BUG_ON(direction == DMA_NONE); for (i = 0; i < nents; i++ ) { struct scatterlist *s = &sg[i]; BUG_ON(!s->page); -- cgit v1.2.3 From d4d35854a1cf7fb170c8e133a5e40a9d19e7fc17 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] remove lock prefix from is_at_popf() tests The lock prefix will cause an exception when used with the popf instruction, so no need to continue searching after it's found. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen --- arch/x86_64/kernel/ptrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index 3a52c7bcfd1..412c6a8bf90 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -138,7 +138,7 @@ static int is_at_popf(struct task_struct *child, struct pt_regs *regs) case 0x26: case 0x2e: case 0x36: case 0x3e: case 0x64: case 0x65: - case 0xf0: case 0xf2: case 0xf3: + case 0xf2: case 0xf3: continue; case 0x40 ... 0x4f: @@ -148,7 +148,7 @@ static int is_at_popf(struct task_struct *child, struct pt_regs *regs) /* 64-bit mode: REX prefix */ continue; - /* CHECKME: f0, f2, f3 */ + /* CHECKME: f2, f3 */ /* * pushf: NOTE! We should probably not let -- cgit v1.2.3 From 9ca33eb6981549c0d1b7aea7f99f1ba602161356 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] Use early CPU identify before early command line parsing This makes it possible to modify CPU flags in command line options without hacks. And remove another copy in head64.c Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head64.c | 19 ------------------- arch/x86_64/kernel/setup.c | 4 ++-- 2 files changed, 2 insertions(+), 21 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index bacbd75c63a..f2461fde9f8 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -56,24 +56,6 @@ static void __init copy_bootdata(char *real_mode_data) printk("Bootdata ok (command line is %s)\n", saved_command_line); } -static void __init setup_boot_cpu_data(void) -{ - unsigned int dummy, eax; - - /* get vendor info */ - cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level, - (unsigned int *)&boot_cpu_data.x86_vendor_id[0], - (unsigned int *)&boot_cpu_data.x86_vendor_id[8], - (unsigned int *)&boot_cpu_data.x86_vendor_id[4]); - - /* get cpu type */ - cpuid(1, &eax, &dummy, &dummy, - (unsigned int *) &boot_cpu_data.x86_capability); - boot_cpu_data.x86 = (eax >> 8) & 0xf; - boot_cpu_data.x86_model = (eax >> 4) & 0xf; - boot_cpu_data.x86_mask = eax & 0xf; -} - void __init x86_64_start_kernel(char * real_mode_data) { char *s; @@ -117,6 +99,5 @@ void __init x86_64_start_kernel(char * real_mode_data) if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE) panic("Kernel too big for kernel mapping\n"); - setup_boot_cpu_data(); start_kernel(); } diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index ddc3b2d9a16..73f1cdd140f 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -545,10 +545,10 @@ void __init setup_arch(char **cmdline_p) data_resource.start = virt_to_phys(&_etext); data_resource.end = virt_to_phys(&_edata)-1; - parse_cmdline_early(cmdline_p); - early_identify_cpu(&boot_cpu_data); + parse_cmdline_early(cmdline_p); + /* * partially used pages are not usable - thus * we are rounding upwards: -- cgit v1.2.3 From 2c8c0e6b8d7700a990da8d24eff767f9ca223b96 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] Convert x86-64 to early param Instead of hackish manual parsing Requires earlier i386 patchkit, but also fixes i386 early_printk again. I removed some obsolete really early parameters which didn't do anything useful. Also made a few parameters that needed it early (mostly oops printing setup) Also removed one panic check that wasn't visible without early console anyways (the early console is now initialized after that panic) This cleans up a lot of code. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/apic.c | 34 ++++--- arch/x86_64/kernel/e820.c | 53 ++++++++-- arch/x86_64/kernel/early_printk.c | 20 ++-- arch/x86_64/kernel/head64.c | 15 --- arch/x86_64/kernel/io_apic.c | 15 ++- arch/x86_64/kernel/machine_kexec.c | 28 ++++++ arch/x86_64/kernel/pci-dma.c | 7 +- arch/x86_64/kernel/setup.c | 197 ++++--------------------------------- arch/x86_64/kernel/setup64.c | 8 +- arch/x86_64/kernel/smpboot.c | 6 +- arch/x86_64/kernel/traps.c | 24 +++-- 11 files changed, 151 insertions(+), 256 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 496cd1bd2ae..660602c51f5 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -36,6 +36,7 @@ #include #include #include +#include int apic_verbosity; int apic_runs_main_timer; @@ -546,18 +547,24 @@ static void apic_pm_activate(void) { } static int __init apic_set_verbosity(char *str) { + if (str == NULL) { + skip_ioapic_setup = 0; + ioapic_force = 1; + return 0; + } if (strcmp("debug", str) == 0) apic_verbosity = APIC_DEBUG; else if (strcmp("verbose", str) == 0) apic_verbosity = APIC_VERBOSE; - else + else { printk(KERN_WARNING "APIC Verbosity level %s not recognised" - " use apic=verbose or apic=debug", str); + " use apic=verbose or apic=debug\n", str); + return -EINVAL; + } - return 1; + return 0; } - -__setup("apic=", apic_set_verbosity); +early_param("apic", apic_set_verbosity); /* * Detect and enable local APICs on non-SMP boards. @@ -1078,14 +1085,17 @@ int __init APIC_init_uniprocessor (void) static __init int setup_disableapic(char *str) { disable_apic = 1; - return 1; -} + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + return 0; +} +early_param("disableapic", setup_disableapic); +/* same as disableapic, for compatibility */ static __init int setup_nolapic(char *str) { - disable_apic = 1; - return 1; + return setup_disableapic(str); } +early_param("nolapic", setup_nolapic); static __init int setup_noapictimer(char *str) { @@ -1118,11 +1128,5 @@ static __init int setup_apicpmtimer(char *s) } __setup("apicpmtimer", setup_apicpmtimer); -/* dummy parsing: see setup.c */ - -__setup("disableapic", setup_disableapic); -__setup("nolapic", setup_nolapic); /* same as disableapic, for compatibility */ - __setup("noapictimer", setup_noapictimer); -/* no "lapic" flag - we only use the lapic when the BIOS tells us so. */ diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 1362aad4f3a..a4739dc8293 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -596,31 +596,64 @@ void __init setup_memory_region(void) e820_print_map(who); } -void __init parse_memopt(char *p, char **from) -{ - end_user_pfn = memparse(p, from); +static int __init parse_memopt(char *p) +{ + if (!p) + return -EINVAL; + end_user_pfn = memparse(p, &p); end_user_pfn >>= PAGE_SHIFT; + return 0; } +early_param("mem", parse_memopt); + +static int userdef __initdata; -void __init parse_memmapopt(char *p, char **from) +static int __init parse_memmap_opt(char *p) { + char *oldp; unsigned long long start_at, mem_size; - mem_size = memparse(p, from); - p = *from; + if (!strcmp(p, "exactmap")) { +#ifdef CONFIG_CRASH_DUMP + /* If we are doing a crash dump, we + * still need to know the real mem + * size before original memory map is + * reset. + */ + saved_max_pfn = e820_end_of_ram(); +#endif + end_pfn_map = 0; + e820.nr_map = 0; + userdef = 1; + return 0; + } + + oldp = p; + mem_size = memparse(p, &p); + if (p == oldp) + return -EINVAL; if (*p == '@') { - start_at = memparse(p+1, from); + start_at = memparse(p+1, &p); add_memory_region(start_at, mem_size, E820_RAM); } else if (*p == '#') { - start_at = memparse(p+1, from); + start_at = memparse(p+1, &p); add_memory_region(start_at, mem_size, E820_ACPI); } else if (*p == '$') { - start_at = memparse(p+1, from); + start_at = memparse(p+1, &p); add_memory_region(start_at, mem_size, E820_RESERVED); } else { end_user_pfn = (mem_size >> PAGE_SHIFT); } - p = *from; + return *p == '\0' ? 0 : -EINVAL; +} +early_param("memmap", parse_memmap_opt); + +void finish_e820_parsing(void) +{ + if (userdef) { + printk(KERN_INFO "user-defined physical RAM map:\n"); + e820_print_map("user"); + } } unsigned long pci_mem_start = 0xaeedbabe; diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index 140051e07fa..e22ecd54870 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -215,20 +215,16 @@ void early_printk(const char *fmt, ...) static int __initdata keep_early; -int __init setup_early_printk(char *opt) +static int __init setup_early_printk(char *buf) { - char *space; - char buf[256]; + if (!buf) + return 0; if (early_console_initialized) - return 1; - - strlcpy(buf,opt,sizeof(buf)); - space = strchr(buf, ' '); - if (space) - *space = 0; + return 0; + early_console_initialized = 1; - if (strstr(buf,"keep")) + if (!strcmp(buf,"keep")) keep_early = 1; if (!strncmp(buf, "serial", 6)) { @@ -248,11 +244,12 @@ int __init setup_early_printk(char *opt) early_console = &simnow_console; keep_early = 1; } - early_console_initialized = 1; register_console(early_console); return 0; } +early_param("earlyprintk", setup_early_printk); + void __init disable_early_printk(void) { if (!early_console_initialized || !early_console) @@ -266,4 +263,3 @@ void __init disable_early_printk(void) } } -__setup("earlyprintk=", setup_early_printk); diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index f2461fde9f8..6716cbfc34a 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -58,7 +58,6 @@ static void __init copy_bootdata(char *real_mode_data) void __init x86_64_start_kernel(char * real_mode_data) { - char *s; int i; for (i = 0; i < 256; i++) @@ -85,19 +84,5 @@ void __init x86_64_start_kernel(char * real_mode_data) #ifdef CONFIG_SMP cpu_set(0, cpu_online_map); #endif - s = strstr(saved_command_line, "earlyprintk="); - if (s != NULL) - setup_early_printk(strchr(s, '=') + 1); -#ifdef CONFIG_NUMA - s = strstr(saved_command_line, "numa="); - if (s != NULL) - numa_setup(s+5); -#endif - if (strstr(saved_command_line, "disableapic")) - disable_apic = 1; - /* You need early console to see that */ - if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE) - panic("Kernel too big for kernel mapping\n"); - start_kernel(); } diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index a1412042b91..afac3dbb372 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -48,7 +48,7 @@ int sis_apic_bug; /* not actually supported, dummy for compile */ static int no_timer_check; -int disable_timer_pin_1 __initdata; +static int disable_timer_pin_1 __initdata; int timer_over_8254 __initdata = 0; @@ -253,18 +253,17 @@ int ioapic_force; static int __init disable_ioapic_setup(char *str) { skip_ioapic_setup = 1; - return 1; + return 0; } +early_param("noapic", disable_ioapic_setup); -static int __init enable_ioapic_setup(char *str) +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ +static int __init disable_timer_pin_setup(char *arg) { - ioapic_force = 1; - skip_ioapic_setup = 0; + disable_timer_pin_1 = 1; return 1; } - -__setup("noapic", disable_ioapic_setup); -__setup("apic", enable_ioapic_setup); +__setup("disable_timer_pin_1", disable_timer_pin_setup); static int __init setup_disable_8254_timer(char *s) { diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c index 106076b370f..2e94c072d84 100644 --- a/arch/x86_64/kernel/machine_kexec.c +++ b/arch/x86_64/kernel/machine_kexec.c @@ -226,3 +226,31 @@ NORET_TYPE void machine_kexec(struct kimage *image) rnk = (relocate_new_kernel_t) control_code_buffer; (*rnk)(page_list, control_code_buffer, image->start, start_pgtable); } + +/* crashkernel=size@addr specifies the location to reserve for + * a crash kernel. By reserving this memory we guarantee + * that linux never set's it up as a DMA target. + * Useful for holding code to do something appropriate + * after a kernel panic. + */ +static int __init setup_crashkernel(char *arg) +{ + unsigned long size, base; + char *p; + if (!arg) + return -EINVAL; + size = memparse(arg, &p); + if (arg == p) + return -EINVAL; + if (*p == '@') { + base = memparse(p+1, &p); + /* FIXME: Do I want a sanity check to validate the + * memory range? Yes you do, but it's too early for + * e820 -AK */ + crashk_res.start = base; + crashk_res.end = base + size - 1; + } + return 0; +} +early_param("crashkernel", setup_crashkernel); + diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index 9c44f4f2433..4dcb671bd19 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -236,6 +236,9 @@ __init int iommu_setup(char *p) { iommu_merge = 1; + if (!p) + return -EINVAL; + while (*p) { if (!strncmp(p,"off",3)) no_iommu = 1; @@ -278,9 +281,9 @@ __init int iommu_setup(char *p) if (*p == ',') ++p; } - return 1; + return 0; } -__setup("iommu=", iommu_setup); +early_param("iommu", iommu_setup); void __init pci_iommu_alloc(void) { diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 73f1cdd140f..f55540a6084 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -76,11 +76,6 @@ unsigned long mmu_cr4_features; int acpi_disabled; EXPORT_SYMBOL(acpi_disabled); -#ifdef CONFIG_ACPI -extern int __initdata acpi_ht; -extern acpi_interrupt_flags acpi_sci_flags; -int __initdata acpi_force = 0; -#endif int acpi_numa __initdata; @@ -276,183 +271,22 @@ static void __init probe_roms(void) } } -/* Check for full argument with no trailing characters */ -static int fullarg(char *p, char *arg) +#ifdef CONFIG_PROC_VMCORE +/* elfcorehdr= specifies the location of elf core header + * stored by the crashed kernel. This option will be passed + * by kexec loader to the capture kernel. + */ +static int __init setup_elfcorehdr(char *arg) { - int l = strlen(arg); - return !memcmp(p, arg, l) && (p[l] == 0 || isspace(p[l])); + char *end; + if (!arg) + return -EINVAL; + elfcorehdr_addr = memparse(arg, &end); + return end > arg ? 0 : -EINVAL; } - -static __init void parse_cmdline_early (char ** cmdline_p) -{ - char c = ' ', *to = command_line, *from = COMMAND_LINE; - int len = 0; - int userdef = 0; - - for (;;) { - if (c != ' ') - goto next_char; - -#ifdef CONFIG_SMP - /* - * If the BIOS enumerates physical processors before logical, - * maxcpus=N at enumeration-time can be used to disable HT. - */ - else if (!memcmp(from, "maxcpus=", 8)) { - extern unsigned int maxcpus; - - maxcpus = simple_strtoul(from + 8, NULL, 0); - } -#endif -#ifdef CONFIG_ACPI - /* "acpi=off" disables both ACPI table parsing and interpreter init */ - if (fullarg(from,"acpi=off")) - disable_acpi(); - - if (fullarg(from, "acpi=force")) { - /* add later when we do DMI horrors: */ - acpi_force = 1; - acpi_disabled = 0; - } - - /* acpi=ht just means: do ACPI MADT parsing - at bootup, but don't enable the full ACPI interpreter */ - if (fullarg(from, "acpi=ht")) { - if (!acpi_force) - disable_acpi(); - acpi_ht = 1; - } - else if (fullarg(from, "pci=noacpi")) - acpi_disable_pci(); - else if (fullarg(from, "acpi=noirq")) - acpi_noirq_set(); - - else if (fullarg(from, "acpi_sci=edge")) - acpi_sci_flags.trigger = 1; - else if (fullarg(from, "acpi_sci=level")) - acpi_sci_flags.trigger = 3; - else if (fullarg(from, "acpi_sci=high")) - acpi_sci_flags.polarity = 1; - else if (fullarg(from, "acpi_sci=low")) - acpi_sci_flags.polarity = 3; - - /* acpi=strict disables out-of-spec workarounds */ - else if (fullarg(from, "acpi=strict")) { - acpi_strict = 1; - } - else if (fullarg(from, "acpi_skip_timer_override")) - acpi_skip_timer_override = 1; +early_param("elfcorehdr", setup_elfcorehdr); #endif - if (fullarg(from, "disable_timer_pin_1")) - disable_timer_pin_1 = 1; - if (fullarg(from, "enable_timer_pin_1")) - disable_timer_pin_1 = -1; - - if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) { - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - disable_apic = 1; - } - - if (fullarg(from, "noapic")) - skip_ioapic_setup = 1; - - if (fullarg(from,"apic")) { - skip_ioapic_setup = 0; - ioapic_force = 1; - } - - if (!memcmp(from, "mem=", 4)) - parse_memopt(from+4, &from); - - if (!memcmp(from, "memmap=", 7)) { - /* exactmap option is for used defined memory */ - if (!memcmp(from+7, "exactmap", 8)) { -#ifdef CONFIG_CRASH_DUMP - /* If we are doing a crash dump, we - * still need to know the real mem - * size before original memory map is - * reset. - */ - saved_max_pfn = e820_end_of_ram(); -#endif - from += 8+7; - end_pfn_map = 0; - e820.nr_map = 0; - userdef = 1; - } - else { - parse_memmapopt(from+7, &from); - userdef = 1; - } - } - -#ifdef CONFIG_NUMA - if (!memcmp(from, "numa=", 5)) - numa_setup(from+5); -#endif - - if (!memcmp(from,"iommu=",6)) { - iommu_setup(from+6); - } - - if (fullarg(from,"oops=panic")) - panic_on_oops = 1; - - if (!memcmp(from, "noexec=", 7)) - nonx_setup(from + 7); - -#ifdef CONFIG_KEXEC - /* crashkernel=size@addr specifies the location to reserve for - * a crash kernel. By reserving this memory we guarantee - * that linux never set's it up as a DMA target. - * Useful for holding code to do something appropriate - * after a kernel panic. - */ - else if (!memcmp(from, "crashkernel=", 12)) { - unsigned long size, base; - size = memparse(from+12, &from); - if (*from == '@') { - base = memparse(from+1, &from); - /* FIXME: Do I want a sanity check - * to validate the memory range? - */ - crashk_res.start = base; - crashk_res.end = base + size - 1; - } - } -#endif - -#ifdef CONFIG_PROC_VMCORE - /* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. This option will be passed - * by kexec loader to the capture kernel. - */ - else if(!memcmp(from, "elfcorehdr=", 11)) - elfcorehdr_addr = memparse(from+11, &from); -#endif - -#ifdef CONFIG_HOTPLUG_CPU - else if (!memcmp(from, "additional_cpus=", 16)) - setup_additional_cpus(from+16); -#endif - - next_char: - c = *(from++); - if (!c) - break; - if (COMMAND_LINE_SIZE <= ++len) - break; - *(to++) = c; - } - if (userdef) { - printk(KERN_INFO "user-defined physical RAM map:\n"); - e820_print_map("user"); - } - *to = '\0'; - *cmdline_p = command_line; -} - #ifndef CONFIG_NUMA static void __init contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) @@ -547,7 +381,12 @@ void __init setup_arch(char **cmdline_p) early_identify_cpu(&boot_cpu_data); - parse_cmdline_early(cmdline_p); + strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + + parse_early_param(); + + finish_e820_parsing(); /* * partially used pages are not usable - thus diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 417de564456..646caa0a20b 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -46,8 +46,10 @@ Control non executable mappings for 64bit processes. on Enable(default) off Disable */ -int __init nonx_setup(char *str) +static int __init nonx_setup(char *str) { + if (!str) + return -EINVAL; if (!strncmp(str, "on", 2)) { __supported_pte_mask |= _PAGE_NX; do_not_nx = 0; @@ -55,9 +57,9 @@ int __init nonx_setup(char *str) do_not_nx = 1; __supported_pte_mask &= ~_PAGE_NX; } - return 1; + return 0; } -__setup("noexec=", nonx_setup); /* parsed early actually */ +early_param("noexec", nonx_setup); int force_personality32 = 0; diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index ae7d6d84e35..0b9f28b05e7 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -1270,11 +1270,11 @@ void __cpu_die(unsigned int cpu) printk(KERN_ERR "CPU %u didn't die...\n", cpu); } -__init int setup_additional_cpus(char *s) +static __init int setup_additional_cpus(char *s) { - return get_option(&s, &additional_cpus); + return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL; } -__setup("additional_cpus=", setup_additional_cpus); +early_param("additional_cpus", setup_additional_cpus); #else /* ... !CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 56d7ff0c894..9ec2b1d5893 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -1119,24 +1119,30 @@ void __init trap_init(void) } -/* Actual parsing is done early in setup.c. */ -static int __init oops_dummy(char *s) +static int __init oops_setup(char *s) { - panic_on_oops = 1; - return 1; + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; } -__setup("oops=", oops_dummy); +early_param("oops", oops_setup); static int __init kstack_setup(char *s) { + if (!s) + return -EINVAL; kstack_depth_to_print = simple_strtoul(s,NULL,0); - return 1; + return 0; } -__setup("kstack=", kstack_setup); +early_param("kstack", kstack_setup); #ifdef CONFIG_STACK_UNWIND static int __init call_trace_setup(char *s) { + if (!s) + return -EINVAL; if (strcmp(s, "old") == 0) call_trace = -1; else if (strcmp(s, "both") == 0) @@ -1145,7 +1151,7 @@ static int __init call_trace_setup(char *s) call_trace = 1; else if (strcmp(s, "new") == 0) call_trace = 2; - return 1; + return 0; } -__setup("call_trace=", call_trace_setup); +early_param("call_trace", call_trace_setup); #endif -- cgit v1.2.3 From 43c85c9c5dff76efc1e411d3302840027ea92004 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] Remove need for early lockdep init I think it was only needed for the printks and we can do them later. I put in a single early_printk so that we know the kernel is alive (early_printk doesn't need any locks) This makes some things easier for initialization of unwind for lockdep, which is needed by later patches. cc: mingo@elte.hu Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head64.c | 8 +------- arch/x86_64/kernel/setup.c | 2 ++ 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index 6716cbfc34a..9561eb3c5b5 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -45,15 +45,12 @@ static void __init copy_bootdata(char *real_mode_data) new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); if (!new_data) { if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { - printk("so old bootloader that it does not support commandline?!\n"); return; } new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; - printk("old bootloader convention, maybe loadlin?\n"); } command_line = (char *) ((u64)(new_data)); memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE); - printk("Bootdata ok (command line is %s)\n", saved_command_line); } void __init x86_64_start_kernel(char * real_mode_data) @@ -65,10 +62,7 @@ void __init x86_64_start_kernel(char * real_mode_data) asm volatile("lidt %0" :: "m" (idt_descr)); clear_bss(); - /* - * This must be called really, really early: - */ - lockdep_init(); + early_printk("Kernel alive\n"); /* * switch to init_level4_pgt from boot_level4_pgt diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index f55540a6084..9c4d93c53c0 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -353,6 +353,8 @@ static void discover_ebda(void) void __init setup_arch(char **cmdline_p) { + printk(KERN_INFO "Command line: %s\n", saved_command_line); + ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); screen_info = SCREEN_INFO; edid_info = EDID_INFO; -- cgit v1.2.3 From df3bb57d2c0160ccd1ee51322f50aa295c3b0858 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] i386/x86-64: Move acpi_disabled variables into acpi/boot.c Removes code duplication between i386/x86-64. Not needed anymore in setup.c since early_param cleanup Cc: len.brown@intel.com Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 9c4d93c53c0..0f86976811d 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -74,9 +74,6 @@ EXPORT_SYMBOL(boot_cpu_data); unsigned long mmu_cr4_features; -int acpi_disabled; -EXPORT_SYMBOL(acpi_disabled); - int acpi_numa __initdata; /* Boot loader ID as an integer, for the benefit of proc_dointvec */ -- cgit v1.2.3 From c31fbb1ad890b11f037c16496e53f28877c12722 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] Clean up acpi_numa variable Move it into srat.c No need to clutter up setup.c for it And remove use in setup.c completely - it only guarded a printk which can be done unconditionally. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 0f86976811d..b85528597aa 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -74,8 +74,6 @@ EXPORT_SYMBOL(boot_cpu_data); unsigned long mmu_cr4_features; -int acpi_numa __initdata; - /* Boot loader ID as an integer, for the benefit of proc_dointvec */ int bootloader_type; @@ -812,8 +810,7 @@ static void srat_detect_node(void) node = first_node(node_online_map); numa_set_node(cpu, node); - if (acpi_numa > 0) - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); #endif } -- cgit v1.2.3 From 3bd4d18cbab622c504f131f3c0029c3aa29c05be Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] Move e820 map into e820.c Minor cleanup. Keep setup.c free from unrelated clutter. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/e820.c | 2 ++ arch/x86_64/kernel/setup.c | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index a4739dc8293..164d0b83dc9 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -24,6 +24,8 @@ #include #include +struct e820map e820 __initdata; + /* * PFN of last memory page. */ diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index b85528597aa..4be29832e3d 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -97,7 +97,6 @@ struct sys_desc_table_struct { struct edid_info edid_info; EXPORT_SYMBOL_GPL(edid_info); -struct e820map e820; extern int root_mountflags; -- cgit v1.2.3 From 131cfd7bd54767ec8959e013f83839442a54d546 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] Add sparse annotation to vsyscall.c Fixes linux/arch/x86_64/kernel/vsyscall.c:276:7: warning: constant 0x0f40000000000 is so big it is long linux/arch/x86_64/kernel/vsyscall.c:80:14: warning: incorrect type in argument 1 (different address spaces) linux/arch/x86_64/kernel/vsyscall.c:80:14: expected void const volatile [noderef] *addr linux/arch/x86_64/kernel/vsyscall.c:80:14: got void * linux/arch/x86_64/kernel/vsyscall.c:200:7: warning: incorrect type in assignment (different address spaces) linux/arch/x86_64/kernel/vsyscall.c:200:7: expected unsigned short [usertype] *map1 linux/arch/x86_64/kernel/vsyscall.c:200:7: got void [noderef] * linux/arch/x86_64/kernel/vsyscall.c:203:7: warning: incorrect type in assignment (different address spaces) linux/arch/x86_64/kernel/vsyscall.c:203:7: expected unsigned short [usertype] *map2 linux/arch/x86_64/kernel/vsyscall.c:203:7: got void [noderef] * linux/arch/x86_64/kernel/vsyscall.c:215:10: warning: incorrect type in argument 1 (different address spaces) linux/arch/x86_64/kernel/vsyscall.c:215:10: expected void volatile [noderef] *addr linux/arch/x86_64/kernel/vsyscall.c:215:10: got unsigned short [usertype] *map2 linux/arch/x86_64/kernel/vsyscall.c:217:10: warning: incorrect type in argument 1 (different address spaces) linux/arch/x86_64/kernel/vsyscall.c:217:10: expected void volatile [noderef] *addr linux/arch/x86_64/kernel/vsyscall.c:217:10: got unsigned short [usertype] *map1 Signed-off-by: Andi Kleen --- arch/x86_64/kernel/vsyscall.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index 902783bc4d5..ac48c3857dd 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -77,7 +77,8 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) __vxtime.tsc_quot) >> 32; /* See comment in x86_64 do_gettimeofday. */ } else { - usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - + usec += ((readl((void __iomem *) + fix_to_virt(VSYSCALL_HPET) + 0xf0) - __vxtime.last) * __vxtime.quot) >> 32; } } while (read_seqretry(&__xtime_lock, sequence)); @@ -191,7 +192,8 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { extern u16 vsysc1, vsysc2; - u16 *map1, *map2; + u16 __iomem *map1; + u16 __iomem *map2; int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); if (!write) return ret; @@ -206,11 +208,11 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, goto out; } if (!sysctl_vsyscall) { - *map1 = SYSCALL; - *map2 = SYSCALL; + writew(SYSCALL, map1); + writew(SYSCALL, map2); } else { - *map1 = NOP2; - *map2 = NOP2; + writew(NOP2, map1); + writew(NOP2, map2); } iounmap(map2); out: -- cgit v1.2.3 From 2b94ab2fd55768030e177a6ec224dedd031ad06b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20Luis=20V=E1zquez=20Cao?= Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] Replace local_save_flags+local_irq_disable with The combination of "local_save_flags" and "local_irq_disable" seems to be equivalent to "local_irq_save" (see code snips below). Consequently, replace occurrences of local_save_flags+local_irq_disable with local_irq_save. * local_irq_save #define raw_local_irq_save(flags) \ do { (flags) = __raw_local_irq_save(); } while (0) static inline unsigned long __raw_local_irq_save(void) { unsigned long flags = __raw_local_save_flags(); raw_local_irq_disable(); return flags; } * local_save_flags #define raw_local_save_flags(flags) \ do { (flags) = __raw_local_save_flags(); } while (0) Signed-off-by: Fernando Vazquez Signed-off-by: Andi Kleen --- arch/x86_64/kernel/apic.c | 3 +-- arch/x86_64/kernel/genapic_flat.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 660602c51f5..b95d1e378b9 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -468,8 +468,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); - local_save_flags(flags); - local_irq_disable(); + local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); return 0; diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index 2ceb8b872b5..50ad153eaac 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -49,8 +49,7 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) unsigned long cfg; unsigned long flags; - local_save_flags(flags); - local_irq_disable(); + local_irq_save(flags); /* * Wait for idle. -- cgit v1.2.3 From 2ade2920dcefdf5595c6380ebed131c964190855 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] i386/x86-64: rename is_at_popf(), add iret to tests and fix is_at_popf() needs to test for the iret instruction as well as popf. So add that test and rename it to is_setting_trap_flag(). Also change max insn length from 16 to 15 to match reality. LAHF / SAHF can't affect TF, so the comment in x86_64 is removed. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen --- arch/x86_64/kernel/ptrace.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index 412c6a8bf90..addc14af0c5 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -116,17 +116,17 @@ unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *r return addr; } -static int is_at_popf(struct task_struct *child, struct pt_regs *regs) +static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) { int i, copied; - unsigned char opcode[16]; + unsigned char opcode[15]; unsigned long addr = convert_rip_to_linear(child, regs); copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); for (i = 0; i < copied; i++) { switch (opcode[i]) { - /* popf */ - case 0x9d: + /* popf and iret */ + case 0x9d: case 0xcf: return 1; /* CHECKME: 64 65 */ @@ -189,10 +189,8 @@ static void set_singlestep(struct task_struct *child) * ..but if TF is changed by the instruction we will trace, * don't mark it as being "us" that set it, so that we * won't clear it by hand later. - * - * AK: this is not enough, LAHF and IRET can change TF in user space too. */ - if (is_at_popf(child, regs)) + if (is_setting_trap_flag(child, regs)) return; child->ptrace |= PT_DTRACE; -- cgit v1.2.3 From de684652f34f57cb60d4d78d09139a0e0c5e7b1b Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] print whether CONFIG_IOMMU_DEBUG is enabled Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index caf84e4e4ca..ebe4e930b64 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -127,15 +127,19 @@ static void tce_cache_blast(struct iommu_table *tbl); /* enable this to stress test the chip's TCE cache */ #ifdef CONFIG_IOMMU_DEBUG +int debugging __read_mostly = 1; + static inline void tce_cache_blast_stress(struct iommu_table *tbl) { tce_cache_blast(tbl); } -#else +#else /* debugging is disabled */ +int debugging __read_mostly = 0; + static inline void tce_cache_blast_stress(struct iommu_table *tbl) { } -#endif /* BLAST_TCE_CACHE_ON_UNMAP */ +#endif /* CONFIG_IOMMU_DEBUG */ static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) { @@ -944,8 +948,10 @@ void __init detect_calgary(void) if (calgary_found) { iommu_detected = 1; calgary_detected = 1; - printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected. " - "TCE table spec is %d.\n", specified_table_size); + printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); + printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " + "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, + debugging ? "enabled" : "disabled"); } return; -- cgit v1.2.3 From 796e4390e0378e1e57c033349610cfc741696a3d Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] only verify the allocation bitmap if CONFIG_IOMMU_DEBUG is on Introduce new function verify_bit_range(). Define two versions, one for CONFIG_IOMMU_DEBUG enabled and one for disabled. Previously we were checking that the bitmap was consistent every time we allocated or freed an entry in the TCE table, which is good for debugging but incurs an unnecessary penalty on non debug builds. Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 44 ++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 9 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index ebe4e930b64..7c43cb0f71a 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -133,12 +133,35 @@ static inline void tce_cache_blast_stress(struct iommu_table *tbl) { tce_cache_blast(tbl); } + +static inline unsigned long verify_bit_range(unsigned long* bitmap, + int expected, unsigned long start, unsigned long end) +{ + unsigned long idx = start; + + BUG_ON(start >= end); + + while (idx < end) { + if (!!test_bit(idx, bitmap) != expected) + return idx; + ++idx; + } + + /* all bits have the expected value */ + return ~0UL; +} #else /* debugging is disabled */ int debugging __read_mostly = 0; static inline void tce_cache_blast_stress(struct iommu_table *tbl) { } + +static inline unsigned long verify_bit_range(unsigned long* bitmap, + int expected, unsigned long start, unsigned long end) +{ + return ~0UL; +} #endif /* CONFIG_IOMMU_DEBUG */ static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) @@ -162,6 +185,7 @@ static void iommu_range_reserve(struct iommu_table *tbl, { unsigned long index; unsigned long end; + unsigned long badbit; index = start_addr >> PAGE_SHIFT; @@ -173,14 +197,15 @@ static void iommu_range_reserve(struct iommu_table *tbl, if (end > tbl->it_size) /* don't go off the table */ end = tbl->it_size; - while (index < end) { - if (test_bit(index, tbl->it_map)) + badbit = verify_bit_range(tbl->it_map, 0, index, end); + if (badbit != ~0UL) { + if (printk_ratelimit()) printk(KERN_ERR "Calgary: entry already allocated at " "0x%lx tbl %p dma 0x%lx npages %u\n", - index, tbl, start_addr, npages); - ++index; + badbit, tbl, start_addr, npages); } - set_bit_string(tbl->it_map, start_addr >> PAGE_SHIFT, npages); + + set_bit_string(tbl->it_map, index, npages); } static unsigned long iommu_range_alloc(struct iommu_table *tbl, @@ -247,7 +272,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned int npages) { unsigned long entry; - unsigned long i; + unsigned long badbit; entry = dma_addr >> PAGE_SHIFT; @@ -255,11 +280,12 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, tce_free(tbl, entry, npages); - for (i = 0; i < npages; ++i) { - if (!test_bit(entry + i, tbl->it_map)) + badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages); + if (badbit != ~0UL) { + if (printk_ratelimit()) printk(KERN_ERR "Calgary: bit is off at 0x%lx " "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", - entry + i, tbl, dma_addr, entry, npages); + badbit, tbl, dma_addr, entry, npages); } __clear_bit_string(tbl->it_map, entry, npages); -- cgit v1.2.3 From 4ccf4ae3144360ab9c00d0b53427f43369287bfb Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] remove tce_cache_blast_stress() tce_cache_blast_stress was useful during bringup to stress the IOMMU's cache flushing. Now that we quiesce DMAs on every cache flush, using _stress() brings the machine down to its knees once you put it under load. Remove this debug / bringup code that isn't useful anymore completely. Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 7c43cb0f71a..cd866135e4e 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -129,11 +129,6 @@ static void tce_cache_blast(struct iommu_table *tbl); #ifdef CONFIG_IOMMU_DEBUG int debugging __read_mostly = 1; -static inline void tce_cache_blast_stress(struct iommu_table *tbl) -{ - tce_cache_blast(tbl); -} - static inline unsigned long verify_bit_range(unsigned long* bitmap, int expected, unsigned long start, unsigned long end) { @@ -153,10 +148,6 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap, #else /* debugging is disabled */ int debugging __read_mostly = 0; -static inline void tce_cache_blast_stress(struct iommu_table *tbl) -{ -} - static inline unsigned long verify_bit_range(unsigned long* bitmap, int expected, unsigned long start, unsigned long end) { @@ -289,8 +280,6 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, } __clear_bit_string(tbl->it_map, entry, npages); - - tce_cache_blast_stress(tbl); } static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, -- cgit v1.2.3 From 4ea8a5d8b57cd504b4b2de1212523848e7ab50cf Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] Calgary IOMMU: eradicate sole remaining 80 chars per line offender Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index cd866135e4e..466588f9560 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -86,7 +86,8 @@ #define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ #define MAX_NUM_CHASSIS 8 /* max number of chassis */ -#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) /* max dev->bus->number */ +/* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */ +#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) #define PHBS_PER_CALGARY 4 /* register offsets in Calgary's internal register space */ -- cgit v1.2.3 From 5a1b3999d6cb7ab87f1f3b1700bc91839fd6fa29 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] x86: Some preparationary cleanup for stack trace - Remove unused all_contexts parameter No caller used it - Move skip argument into the structure (needed for followon patches) Cc: mingo@elte.hu Signed-off-by: Andi Kleen --- arch/x86_64/kernel/stacktrace.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c index 32cf55eb9af..1c022af8fe1 100644 --- a/arch/x86_64/kernel/stacktrace.c +++ b/arch/x86_64/kernel/stacktrace.c @@ -109,9 +109,10 @@ out_restore: * Save stack-backtrace addresses into a stack_trace buffer: */ static inline unsigned long -save_context_stack(struct stack_trace *trace, unsigned int skip, +save_context_stack(struct stack_trace *trace, unsigned long stack, unsigned long stack_end) { + int skip = trace->skip; unsigned long addr; #ifdef CONFIG_FRAME_POINTER @@ -159,12 +160,8 @@ save_context_stack(struct stack_trace *trace, unsigned int skip, /* * Save stack-backtrace addresses into a stack_trace buffer. - * If all_contexts is set, all contexts (hardirq, softirq and process) - * are saved. If not set then only the current context is saved. */ -void save_stack_trace(struct stack_trace *trace, - struct task_struct *task, int all_contexts, - unsigned int skip) +void save_stack_trace(struct stack_trace *trace, struct task_struct *task) { unsigned long stack = (unsigned long)&stack; int i, nr_stacks = 0, stacks_done[MAX_STACKS]; @@ -207,9 +204,8 @@ void save_stack_trace(struct stack_trace *trace, return; stacks_done[nr_stacks] = stack_end; - stack = save_context_stack(trace, skip, stack, stack_end); - if (!all_contexts || !stack || - trace->nr_entries >= trace->max_entries) + stack = save_context_stack(trace, stack, stack_end); + if (!stack || trace->nr_entries >= trace->max_entries) return; trace->entries[trace->nr_entries++] = ULONG_MAX; if (trace->nr_entries >= trace->max_entries) -- cgit v1.2.3 From b7f5e3c7742d5332b78b831131f43fc3630e6322 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] Don't access the APIC in safe_smp_processor_id when it is not mapped yet Lockdep can call the dwarf2 unwinder early, and the dwarf2 code uses safe_smp_processor_id which tries to access the local APIC page. But that doesn't work before the APIC code has set up its fixmap. Check for this case and always return boot cpu then. Cc: jbeulich@novell.com Cc: mingo@elte.hu Signed-off-by: Andi Kleen --- arch/x86_64/kernel/apic.c | 2 ++ arch/x86_64/kernel/smp.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index b95d1e378b9..6472e321cad 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -38,6 +38,7 @@ #include #include +int apic_mapped; int apic_verbosity; int apic_runs_main_timer; int apic_calibrate_pmtmr __initdata; @@ -600,6 +601,7 @@ void __init init_apic_mappings(void) apic_phys = mp_lapic_addr; set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + apic_mapped = 1; apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys); /* diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 06af6ca6012..2df988bbf31 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -526,7 +526,7 @@ int safe_smp_processor_id(void) { unsigned apicid, i; - if (disable_apic) + if (disable_apic || !apic_mapped) return 0; apicid = hard_smp_processor_id(); -- cgit v1.2.3 From c0b766f13d8e1189ce4d00e54700c9d96b543b9a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] Merge stacktrace and show_trace This unifies the standard backtracer and the new stacktrace in memory backtracer. The standard one is converted to use callbacks and then reimplement stacktrace using new callbacks. The main advantage is that stacktrace can now use the new dwarf2 unwinder and avoid false positives in many cases. I kept it simple to make sure the standard backtracer stays reliable. Cc: mingo@elte.hu Signed-off-by: Andi Kleen --- arch/x86_64/kernel/stacktrace.c | 214 +++++----------------------------------- arch/x86_64/kernel/traps.c | 99 ++++++++++++++----- 2 files changed, 102 insertions(+), 211 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c index 1c022af8fe1..6026b31d037 100644 --- a/arch/x86_64/kernel/stacktrace.c +++ b/arch/x86_64/kernel/stacktrace.c @@ -7,211 +7,49 @@ */ #include #include +#include +#include -#include - -static inline int -in_range(unsigned long start, unsigned long addr, unsigned long end) +static void save_stack_warning(void *data, char *msg) { - return addr >= start && addr <= end; } -static unsigned long -get_stack_end(struct task_struct *task, unsigned long stack) +static void +save_stack_warning_symbol(void *data, char *msg, unsigned long symbol) { - unsigned long stack_start, stack_end, flags; - int i, cpu; - - /* - * The most common case is that we are in the task stack: - */ - stack_start = (unsigned long)task->thread_info; - stack_end = stack_start + THREAD_SIZE; - - if (in_range(stack_start, stack, stack_end)) - return stack_end; - - /* - * We are in an interrupt if irqstackptr is set: - */ - raw_local_irq_save(flags); - cpu = safe_smp_processor_id(); - stack_end = (unsigned long)cpu_pda(cpu)->irqstackptr; - - if (stack_end) { - stack_start = stack_end & ~(IRQSTACKSIZE-1); - if (in_range(stack_start, stack, stack_end)) - goto out_restore; - /* - * We get here if we are in an IRQ context but we - * are also in an exception stack. - */ - } - - /* - * Iterate over all exception stacks, and figure out whether - * 'stack' is in one of them: - */ - for (i = 0; i < N_EXCEPTION_STACKS; i++) { - /* - * set 'end' to the end of the exception stack. - */ - stack_end = per_cpu(init_tss, cpu).ist[i]; - stack_start = stack_end - EXCEPTION_STKSZ; - - /* - * Is 'stack' above this exception frame's end? - * If yes then skip to the next frame. - */ - if (stack >= stack_end) - continue; - /* - * Is 'stack' above this exception frame's start address? - * If yes then we found the right frame. - */ - if (stack >= stack_start) - goto out_restore; - - /* - * If this is a debug stack, and if it has a larger size than - * the usual exception stacks, then 'stack' might still - * be within the lower portion of the debug stack: - */ -#if DEBUG_STKSZ > EXCEPTION_STKSZ - if (i == DEBUG_STACK - 1 && stack >= stack_end - DEBUG_STKSZ) { - /* - * Black magic. A large debug stack is composed of - * multiple exception stack entries, which we - * iterate through now. Dont look: - */ - do { - stack_end -= EXCEPTION_STKSZ; - stack_start -= EXCEPTION_STKSZ; - } while (stack < stack_start); - - goto out_restore; - } -#endif - } - /* - * Ok, 'stack' is not pointing to any of the system stacks. - */ - stack_end = 0; - -out_restore: - raw_local_irq_restore(flags); - - return stack_end; } - -/* - * Save stack-backtrace addresses into a stack_trace buffer: - */ -static inline unsigned long -save_context_stack(struct stack_trace *trace, - unsigned long stack, unsigned long stack_end) +static int save_stack_stack(void *data, char *name) { - int skip = trace->skip; - unsigned long addr; - -#ifdef CONFIG_FRAME_POINTER - unsigned long prev_stack = 0; + struct stack_trace *trace = (struct stack_trace *)data; + return trace->all_contexts ? 0 : -1; +} - while (in_range(prev_stack, stack, stack_end)) { - pr_debug("stack: %p\n", (void *)stack); - addr = (unsigned long)(((unsigned long *)stack)[1]); - pr_debug("addr: %p\n", (void *)addr); - if (!skip) - trace->entries[trace->nr_entries++] = addr-1; - else - skip--; - if (trace->nr_entries >= trace->max_entries) - break; - if (!addr) - return 0; - /* - * Stack frames must go forwards (otherwise a loop could - * happen if the stackframe is corrupted), so we move - * prev_stack forwards: - */ - prev_stack = stack; - stack = (unsigned long)(((unsigned long *)stack)[0]); - } - pr_debug("invalid: %p\n", (void *)stack); -#else - while (stack < stack_end) { - addr = ((unsigned long *)stack)[0]; - stack += sizeof(long); - if (__kernel_text_address(addr)) { - if (!skip) - trace->entries[trace->nr_entries++] = addr-1; - else - skip--; - if (trace->nr_entries >= trace->max_entries) - break; - } +static void save_stack_address(void *data, unsigned long addr) +{ + struct stack_trace *trace = (struct stack_trace *)data; + if (trace->skip > 0) { + trace->skip--; + return; } -#endif - return stack; + if (trace->nr_entries < trace->max_entries - 1) + trace->entries[trace->nr_entries++] = addr; } -#define MAX_STACKS 10 +static struct stacktrace_ops save_stack_ops = { + .warning = save_stack_warning, + .warning_symbol = save_stack_warning_symbol, + .stack = save_stack_stack, + .address = save_stack_address, +}; /* * Save stack-backtrace addresses into a stack_trace buffer. */ void save_stack_trace(struct stack_trace *trace, struct task_struct *task) { - unsigned long stack = (unsigned long)&stack; - int i, nr_stacks = 0, stacks_done[MAX_STACKS]; - - WARN_ON(trace->nr_entries || !trace->max_entries); - - if (!task) - task = current; - - pr_debug("task: %p, ti: %p\n", task, task->thread_info); - - if (!task || task == current) { - /* Grab rbp right from our regs: */ - asm ("mov %%rbp, %0" : "=r" (stack)); - pr_debug("rbp: %p\n", (void *)stack); - } else { - /* rbp is the last reg pushed by switch_to(): */ - stack = task->thread.rsp; - pr_debug("other task rsp: %p\n", (void *)stack); - stack = (unsigned long)(((unsigned long *)stack)[0]); - pr_debug("other task rbp: %p\n", (void *)stack); - } - - while (1) { - unsigned long stack_end = get_stack_end(task, stack); - - pr_debug("stack: %p\n", (void *)stack); - pr_debug("stack end: %p\n", (void *)stack_end); - - /* - * Invalid stack addres? - */ - if (!stack_end) - return; - /* - * Were we in this stack already? (recursion) - */ - for (i = 0; i < nr_stacks; i++) - if (stacks_done[i] == stack_end) - return; - stacks_done[nr_stacks] = stack_end; - - stack = save_context_stack(trace, stack, stack_end); - if (!stack || trace->nr_entries >= trace->max_entries) - return; - trace->entries[trace->nr_entries++] = ULONG_MAX; - if (trace->nr_entries >= trace->max_entries) - return; - if (++nr_stacks >= MAX_STACKS) - return; - } + dump_trace(task, NULL, NULL, &save_stack_ops, trace); + trace->entries[trace->nr_entries++] = ULONG_MAX; } +EXPORT_SYMBOL(save_stack_trace); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 9ec2b1d5893..4ac18b02ead 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -45,6 +45,7 @@ #include #include #include +#include asmlinkage void divide_error(void); asmlinkage void debug(void); @@ -142,7 +143,7 @@ void printk_address(unsigned long address) #endif static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, const char **idp) + unsigned *usedp, char **idp) { static char ids[][8] = { [DEBUG_STACK - 1] = "#DB", @@ -234,13 +235,19 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, return NULL; } -static int show_trace_unwind(struct unwind_frame_info *info, void *context) +struct ops_and_data { + struct stacktrace_ops *ops; + void *data; +}; + +static int dump_trace_unwind(struct unwind_frame_info *info, void *context) { + struct ops_and_data *oad = (struct ops_and_data *)context; int n = 0; while (unwind(info) == 0 && UNW_PC(info)) { n++; - printk_address(UNW_PC(info)); + oad->ops->address(oad->data, UNW_PC(info)); if (arch_unw_user_mode(info)) break; } @@ -254,45 +261,51 @@ static int show_trace_unwind(struct unwind_frame_info *info, void *context) * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack) +void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack, + struct stacktrace_ops *ops, void *data) { const unsigned cpu = safe_smp_processor_id(); unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; unsigned used = 0; - printk("\nCall Trace:\n"); - if (!tsk) tsk = current; if (call_trace >= 0) { int unw_ret = 0; struct unwind_frame_info info; + struct ops_and_data oad = { .ops = ops, .data = data }; if (regs) { if (unwind_init_frame_info(&info, tsk, regs) == 0) - unw_ret = show_trace_unwind(&info, NULL); + unw_ret = dump_trace_unwind(&info, &oad); } else if (tsk == current) - unw_ret = unwind_init_running(&info, show_trace_unwind, NULL); + unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad); else { if (unwind_init_blocked(&info, tsk) == 0) - unw_ret = show_trace_unwind(&info, NULL); + unw_ret = dump_trace_unwind(&info, &oad); } if (unw_ret > 0) { if (call_trace == 1 && !arch_unw_user_mode(&info)) { - print_symbol("DWARF2 unwinder stuck at %s\n", + ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n", UNW_PC(&info)); if ((long)UNW_SP(&info) < 0) { - printk("Leftover inexact backtrace:\n"); + ops->warning(data, "Leftover inexact backtrace:\n"); stack = (unsigned long *)UNW_SP(&info); } else - printk("Full inexact backtrace again:\n"); + ops->warning(data, "Full inexact backtrace again:\n"); } else if (call_trace >= 1) return; else - printk("Full inexact backtrace again:\n"); + ops->warning(data, "Full inexact backtrace again:\n"); } else - printk("Inexact backtrace:\n"); + ops->warning(data, "Inexact backtrace:\n"); + } + if (!stack) { + unsigned long dummy; + stack = &dummy; + if (tsk && tsk != current) + stack = (unsigned long *)tsk->thread.rsp; } /* @@ -312,7 +325,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s * down the cause of the crash will be able to figure \ * out the call path that was taken. \ */ \ - printk_address(addr); \ + ops->address(data, addr); \ } \ } while (0) @@ -321,16 +334,17 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s * current stack address. If the stacks consist of nested * exceptions */ - for ( ; ; ) { - const char *id; + for (;;) { + char *id; unsigned long *estack_end; estack_end = in_exception_stack(cpu, (unsigned long)stack, &used, &id); if (estack_end) { - printk(" <%s>", id); + if (ops->stack(data, id) < 0) + break; HANDLE_STACK (stack < estack_end); - printk(" "); + ops->stack(data, ""); /* * We link to the next stack via the * second-to-last pointer (index -2 to end) in the @@ -345,7 +359,8 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s (IRQSTACKSIZE - 64) / sizeof(*irqstack); if (stack >= irqstack && stack < irqstack_end) { - printk(" "); + if (ops->stack(data, "IRQ") < 0) + break; HANDLE_STACK (stack < irqstack_end); /* * We link to the next stack (which would be @@ -354,7 +369,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s */ stack = (unsigned long *) (irqstack_end[-1]); irqstack_end = NULL; - printk(" "); + ops->stack(data, "EOI"); continue; } } @@ -362,15 +377,53 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s } /* - * This prints the process stack: + * This handles the process stack: */ HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); #undef HANDLE_STACK +} +EXPORT_SYMBOL(dump_trace); +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + print_symbol(msg, symbol); + printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ + printk("%s\n", msg); +} + +static int print_trace_stack(void *data, char *name) +{ + printk(" <%s> ", name); + return 0; +} + +static void print_trace_address(void *data, unsigned long addr) +{ + printk_address(addr); +} + +static struct stacktrace_ops print_trace_ops = { + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, +}; + +void +show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack) +{ + printk("\nCall Trace:\n"); + dump_trace(tsk, regs, stack, &print_trace_ops, NULL); printk("\n"); } -static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp) +static void +_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) { unsigned long *stack; int i; -- cgit v1.2.3 From be7a91709b90825990e571b2f20cea937d5eef6c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] Check for end of stack trace before falling back Signed-off-by: Andi Kleen --- arch/x86_64/kernel/traps.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 4ac18b02ead..28e53342f29 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -292,6 +292,8 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s if ((long)UNW_SP(&info) < 0) { ops->warning(data, "Leftover inexact backtrace:\n"); stack = (unsigned long *)UNW_SP(&info); + if (!stack) + return; } else ops->warning(data, "Full inexact backtrace again:\n"); } else if (call_trace >= 1) -- cgit v1.2.3 From d28c4393a7bf558538e9def269c1caeab6ec056f Mon Sep 17 00:00:00 2001 From: "Prasanna S.P" Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] x86: error_code is not safe for kprobes This patch moves the entry.S:error_entry to .kprobes.text section, since code marked unsafe for kprobes jumps directly to entry.S::error_entry, that must be marked unsafe as well. This patch also moves all the ".previous.text" asm directives to ".previous" for kprobes section. AK: Following a similar i386 patch from Chuck Ebbert AK: Also merged Jeremy's fix in. +From: Jeremy Fitzhardinge KPROBE_ENTRY does a .section .kprobes.text, and expects its users to do a .previous at the end of the function. Unfortunately, if any code within the function switches sections, for example .fixup, then the .previous ends up putting all subsequent code into .fixup. Worse, any subsequent .fixup code gets intermingled with the code its supposed to be fixing (which is also in .fixup). It's surprising this didn't cause more havok. The fix is to use .pushsection/.popsection, so this stuff nests properly. A further cleanup would be to get rid of all .section/.previous pairs, since they're inherently fragile. +From: Chuck Ebbert <76306.1226@compuserve.com> Because code marked unsafe for kprobes jumps directly to entry.S::error_code, that must be marked unsafe as well. The easiest way to do that is to move the page fault entry point to just before error_code and let it inherit the same section. Also moved all the ".previous" asm directives for kprobes sections to column 1 and removed ".text" from them. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen --- arch/x86_64/kernel/entry.S | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 2092f565aa8..780f9b26169 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -819,7 +819,7 @@ paranoid_schedule\trace: * Exception entry point. This expects an error code/orig_rax on the stack * and the exception handler in %rax. */ -ENTRY(error_entry) +KPROBE_ENTRY(error_entry) _frame RDI /* rdi slot contains rax, oldrax contains error code */ cld @@ -903,7 +903,7 @@ error_kernelspace: cmpq $gs_change,RIP(%rsp) je error_swapgs jmp error_sti -END(error_entry) +KPROBE_END(error_entry) /* Reload gs selector with exception handling */ /* edi: new selector */ @@ -1025,8 +1025,7 @@ ENDPROC(execve) KPROBE_ENTRY(page_fault) errorentry do_page_fault -END(page_fault) - .previous .text +KPROBE_END(page_fault) ENTRY(coprocessor_error) zeroentry do_coprocessor_error @@ -1047,8 +1046,7 @@ KPROBE_ENTRY(debug) CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_debug, DEBUG_STACK paranoidexit -END(debug) - .previous .text +KPROBE_END(debug) /* runs on exception stack */ KPROBE_ENTRY(nmi) @@ -1062,8 +1060,7 @@ KPROBE_ENTRY(nmi) jmp paranoid_exit1 CFI_ENDPROC #endif -END(nmi) - .previous .text +KPROBE_END(nmi) KPROBE_ENTRY(int3) INTR_FRAME @@ -1072,8 +1069,7 @@ KPROBE_ENTRY(int3) paranoidentry do_int3, DEBUG_STACK jmp paranoid_exit1 CFI_ENDPROC -END(int3) - .previous .text +KPROBE_END(int3) ENTRY(overflow) zeroentry do_overflow @@ -1121,8 +1117,7 @@ END(stack_segment) KPROBE_ENTRY(general_protection) errorentry do_general_protection -END(general_protection) - .previous .text +KPROBE_END(general_protection) ENTRY(alignment_check) errorentry do_alignment_check -- cgit v1.2.3 From cbf9b4bb76c9ce53b7fdde0dffcd000951b5f0d4 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] X86_64 monotonic_clock goes backwards I've noticed some erratic behavior while testing the X86_64 version of monotonic_clock(). While spinning in a loop reading monotonic clock values (pinned to a single cpu) I noticed that the difference between subsequent values occasionally went negative (time going backwards). I found that in the following code: this_offset = get_cycles_sync(); /* FIXME: 1000 or 1000000? */ --> offset = (this_offset - last_offset)*1000 / cpu_khz; } return base + offset; the offset sometimes turns out to be 0, even though this_offset > last_offset. +Added fix From: Toyo Abe The x86_64-mm-monotonic-clock.patch in 2.6.18-rc4-mm2 made a change to the updating of monotonic_base. It now uses cycles_2_ns(). I suggest that a set_cyc2ns_scale() should be done prior to the setup_irq(). Because cycles_2_ns() can be called from the timer ISR right after the irq0 is enabled. Signed-off-by: Toyo Abe Signed-off-by: Dimitri Sivanich Signed-off-by: Andi Kleen --- arch/x86_64/kernel/time.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index d66c7f750e7..97115e608ed 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -276,6 +276,7 @@ static void set_rtc_mmss(unsigned long nowtime) * Note: This function is required to return accurate * time even in the absence of multiple timer ticks. */ +static inline unsigned long long cycles_2_ns(unsigned long long cyc); unsigned long long monotonic_clock(void) { unsigned long seq; @@ -300,8 +301,7 @@ unsigned long long monotonic_clock(void) base = monotonic_base; } while (read_seqretry(&xtime_lock, seq)); this_offset = get_cycles_sync(); - /* FIXME: 1000 or 1000000? */ - offset = (this_offset - last_offset)*1000 / cpu_khz; + offset = cycles_2_ns(this_offset - last_offset); } return base + offset; } @@ -405,8 +405,7 @@ void main_timer_handler(struct pt_regs *regs) offset %= USEC_PER_TICK; } - /* FIXME: 1000 or 1000000? */ - monotonic_base += (tsc - vxtime.last_tsc) * 1000000 / cpu_khz; + monotonic_base += cycles_2_ns(tsc - vxtime.last_tsc); vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; @@ -929,10 +928,8 @@ void __init time_init(void) vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; vxtime.last_tsc = get_cycles_sync(); - setup_irq(0, &irq0); - set_cyc2ns_scale(cpu_khz); - + setup_irq(0, &irq0); hotcpu_notifier(time_cpu_notifier, 0); time_cpu_notifier(NULL, CPU_ONLINE, (void *)(long)smp_processor_id()); -- cgit v1.2.3 From 2717941c6a1d8fa543ddca337d450ab30ef91543 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] Make boot_param_data pure BSS Since it's all zero. Actually I think gcc 4+ will do that automatically, but earlier compilers won't Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 646caa0a20b..b09e60fa96b 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -24,7 +24,7 @@ #include #include -char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; +char x86_boot_params[BOOT_PARAM_SIZE] __initdata; cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; -- cgit v1.2.3 From ba4d40bb5c465f0a4dcc30d02dab80c2cb7e1ff3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] Auto size the per cpu area. Now for a completely different but trivial approach. I just boot tested it with 255 CPUS and everything worked. Currently everything (except module data) we place in the per cpu area we know about at compile time. So instead of allocating a fixed size for the per_cpu area allocate the number of bytes we need plus a fixed constant for to be used for modules. It isn't perfect but it is much less of a pain to work with than what we are doing now. AK: fixed warning Signed-off-by: Eric W. Biederman Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup64.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index b09e60fa96b..e85cfbb49b6 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -95,12 +95,9 @@ void __init setup_per_cpu_areas(void) #endif /* Copy section for each CPU (we discard the original) */ - size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); -#ifdef CONFIG_MODULES - if (size < PERCPU_ENOUGH_ROOM) - size = PERCPU_ENOUGH_ROOM; -#endif + size = PERCPU_ENOUGH_ROOM; + printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size); for_each_cpu_mask (i, cpu_possible_map) { char *ptr; -- cgit v1.2.3 From e07e23e1fd3000289fc7ccc6c71879070d3b19e0 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] non lazy "sleazy" fpu implementation Right now the kernel on x86-64 has a 100% lazy fpu behavior: after *every* context switch a trap is taken for the first FPU use to restore the FPU context lazily. This is of course great for applications that have very sporadic or no FPU use (since then you avoid doing the expensive save/restore all the time). However for very frequent FPU users... you take an extra trap every context switch. The patch below adds a simple heuristic to this code: After 5 consecutive context switches of FPU use, the lazy behavior is disabled and the context gets restored every context switch. If the app indeed uses the FPU, the trap is avoided. (the chance of the 6th time slice using FPU after the previous 5 having done so are quite high obviously). After 256 switches, this is reset and lazy behavior is returned (until there are 5 consecutive ones again). The reason for this is to give apps that do longer bursts of FPU use still the lazy behavior back after some time. [akpm@osdl.org: place new task_struct field next to jit_keyring to save space] Signed-off-by: Arjan van de Ven Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/kernel/process.c | 10 ++++++++++ arch/x86_64/kernel/traps.c | 1 + 2 files changed, 11 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 6fbd19564e4..9e9a70e50c7 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -552,6 +552,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); + /* we're going to use this soon, after a few expensive things */ + if (next_p->fpu_counter>5) + prefetch(&next->i387.fxsave); + /* * Reload esp0, LDT and the page table pointer: */ @@ -629,6 +633,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) __switch_to_xtra(prev_p, next_p, tss); + /* If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + if (next_p->fpu_counter>5) + math_state_restore(); return prev_p; } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 28e53342f29..ffc40cff1e0 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -1136,6 +1136,7 @@ asmlinkage void math_state_restore(void) init_fpu(me); restore_fpu_checking(&me->thread.i387.fxsave); task_thread_info(me)->status |= TS_USEDFPU; + me->fpu_counter++; } void __init trap_init(void) -- cgit v1.2.3 From abf0f10948b316b577851ef21c728341f1046552 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] wire up oops_enter()/oops_exit() Implement pause_on_oops() on x86_64. AK: I redid the patch to do the oops_enter/exit in the existing oops_begin()/end(). This makes it much shorter. Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/x86_64/kernel/traps.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index ffc40cff1e0..34660b1e272 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -561,6 +561,8 @@ unsigned __kprobes long oops_begin(void) int cpu = safe_smp_processor_id(); unsigned long flags; + oops_enter(); + /* racy, but better than risking deadlock. */ local_irq_save(flags); if (!spin_trylock(&die_lock)) { @@ -589,6 +591,7 @@ void __kprobes oops_end(unsigned long flags) spin_unlock_irqrestore(&die_lock, flags); if (panic_on_oops) panic("Fatal exception"); + oops_exit(); } void __kprobes __die(const char * str, struct pt_regs * regs, long err) -- cgit v1.2.3 From ed77504b2007ff7ce56841227467ac3ead52df62 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] mark init_amd() as __cpuinit The init_amd() function is only called from identify_cpu() which is already marked as __cpuinit. So let's mark it as __cpuinit. Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 4be29832e3d..e328e3eb8cb 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -672,7 +672,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) #endif } -static void __init init_amd(struct cpuinfo_x86 *c) +static void __cpuinit init_amd(struct cpuinfo_x86 *c) { unsigned level; -- cgit v1.2.3 From dbf9272e863bf4b17ee8e3c66c26682b2061d40d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] Don't force reserve the 640k-1MB range From i386 x86-64 inherited code to force reserve the 640k-1MB area. That was needed on some old systems. But we generally trust the e820 map to be correct on 64bit systems and mark all areas that are not memory correctly. This patch will allow to use the real memory in there. Or rather the only way to find out if it's still needed is to try. So far I'm optimistic. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/e820.c | 33 +-------------------------------- 1 file changed, 1 insertion(+), 32 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 164d0b83dc9..e06c2714ecf 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -71,12 +71,7 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) return 1; } #endif - /* kernel code + 640k memory hole (later should not be needed, but - be paranoid for now) */ - if (last >= 640*1024 && addr < 1024*1024) { - *addrp = 1024*1024; - return 1; - } + /* kernel code */ if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) { *addrp = __pa_symbol(&_end); return 1; @@ -519,13 +514,6 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) * If we're lucky and live on a modern system, the setup code * will have given us a memory map that we can use to properly * set up memory. If we aren't, we'll fake a memory map. - * - * We check to see that the memory map contains at least 2 elements - * before we'll use it, because the detection code in setup.S may - * not be perfect and most every PC known to man has two memory - * regions: one from 0 to 640k, and one from 1mb up. (The IBM - * thinkpad 560x, for example, does not cooperate with the memory - * detection code.) */ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) { @@ -543,25 +531,6 @@ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) if (start > end) return -1; - /* - * Some BIOSes claim RAM in the 640k - 1M region. - * Not right. Fix it up. - * - * This should be removed on Hammer which is supposed to not - * have non e820 covered ISA mappings there, but I had some strange - * problems so it stays for now. -AK - */ - if (type == E820_RAM) { - if (start < 0x100000ULL && end > 0xA0000ULL) { - if (start < 0xA0000ULL) - add_memory_region(start, 0xA0000ULL-start, type); - if (end <= 0x100000ULL) - continue; - start = 0x100000ULL; - size = end - start; - } - } - add_memory_region(start, size, type); } while (biosmap++,--nr_map); return 0; -- cgit v1.2.3 From 34464a5b8937b79801776dfb6970c1b949fed4be Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Detect clock skew during suspend Detect the situations in which the time after a resume from disk would be earlier than the time before the suspend and prevent them from happening on x86_64. Signed-off-by: Rafael J. Wysocki Signed-off-by: Andi Kleen --- arch/x86_64/kernel/time.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 97115e608ed..9dd15d12b65 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -1039,8 +1039,16 @@ static int timer_resume(struct sys_device *dev) unsigned long flags; unsigned long sec; unsigned long ctime = get_cmos_time(); - unsigned long sleep_length = (ctime - sleep_start) * HZ; + long sleep_length = (ctime - sleep_start) * HZ; + if (sleep_length < 0) { + printk(KERN_WARNING "Time skew detected in timer resume!\n"); + /* The time after the resume must not be earlier than the time + * before the suspend or some nasty things will happen + */ + sleep_length = 0; + ctime = sleep_start; + } if (vxtime.hpet_address) hpet_reenable(); else -- cgit v1.2.3 From 151f8cc1169f9052095b2be36183ab132d75c6c2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Remove safe_smp_processor_id() And replace all users with ordinary smp_processor_id. The function was originally added to get some basic oops information out even if the GS register was corrupted. However that didn't work for some anymore because printk is needed to print the oops and it uses smp_processor_id() already. Also GS register corruptions are not particularly common anymore. This also helps the Xen port which would otherwise need to do this in a special way because it can't access the local APIC. Cc: Chris Wright Signed-off-by: Andi Kleen --- arch/x86_64/kernel/mce.c | 2 +- arch/x86_64/kernel/smp.c | 23 ----------------------- arch/x86_64/kernel/traps.c | 10 +++++----- 3 files changed, 6 insertions(+), 29 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 4e017fb30fb..1a93c373840 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -182,7 +182,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) goto out2; memset(&m, 0, sizeof(struct mce)); - m.cpu = safe_smp_processor_id(); + m.cpu = smp_processor_id(); rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); if (!(m.mcgstatus & MCG_STATUS_RIPV)) kill_it = 1; diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 2df988bbf31..4f67697f503 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -522,26 +522,3 @@ asmlinkage void smp_call_function_interrupt(void) } } -int safe_smp_processor_id(void) -{ - unsigned apicid, i; - - if (disable_apic || !apic_mapped) - return 0; - - apicid = hard_smp_processor_id(); - if (apicid < NR_CPUS && x86_cpu_to_apicid[apicid] == apicid) - return apicid; - - for (i = 0; i < NR_CPUS; ++i) { - if (x86_cpu_to_apicid[i] == apicid) - return i; - } - - /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI, - * or called too early. Either way, we must be CPU 0. */ - if (x86_cpu_to_apicid[0] == BAD_APICID) - return 0; - - return 0; /* Should not happen */ -} diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 34660b1e272..38bc821e457 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -264,7 +264,7 @@ static int dump_trace_unwind(struct unwind_frame_info *info, void *context) void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack, struct stacktrace_ops *ops, void *data) { - const unsigned cpu = safe_smp_processor_id(); + const unsigned cpu = smp_processor_id(); unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; unsigned used = 0; @@ -429,7 +429,7 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) { unsigned long *stack; int i; - const int cpu = safe_smp_processor_id(); + const int cpu = smp_processor_id(); unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); @@ -483,7 +483,7 @@ void show_registers(struct pt_regs *regs) int i; int in_kernel = !user_mode(regs); unsigned long rsp; - const int cpu = safe_smp_processor_id(); + const int cpu = smp_processor_id(); struct task_struct *cur = cpu_pda(cpu)->pcurrent; rsp = regs->rsp; @@ -558,7 +558,7 @@ static unsigned int die_nest_count; unsigned __kprobes long oops_begin(void) { - int cpu = safe_smp_processor_id(); + int cpu = smp_processor_id(); unsigned long flags; oops_enter(); @@ -636,7 +636,7 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) * We are in trouble anyway, lets at least try * to get a message out. */ - printk(str, safe_smp_processor_id()); + printk(str, smp_processor_id()); show_registers(regs); if (kexec_should_crash(current)) crash_kexec(regs); -- cgit v1.2.3 From f2c2cca3acef8b253a36381d9b469ad4fb08563a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Remove APIC version/cpu capability mpparse checking/printing ACPI went to great trouble to get the APIC version and CPU capabilities of different CPUs before passing them to the mpparser. But all that data was used was to print it out. Actually it even faked some data based on the boot cpu, not on the actual CPU being booted. Remove all this code because it's not needed. Cc: len.brown@intel.com Signed-off-by: Andi Kleen --- arch/x86_64/kernel/io_apic.c | 13 ----------- arch/x86_64/kernel/mpparse.c | 52 +++++++++++++++----------------------------- 2 files changed, 17 insertions(+), 48 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index afac3dbb372..0491019d4c8 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -1748,19 +1748,6 @@ device_initcall(ioapic_init_sysfs); #define IO_APIC_MAX_ID 0xFE -int __init io_apic_get_version (int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.version; -} - - int __init io_apic_get_redir_entries (int ioapic) { union IO_APIC_reg_01 reg_01; diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index d63f849aea2..32b6977f80c 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -41,7 +41,6 @@ int acpi_found_madt; * Various Linux-internal data structures created from the * MP-table. */ -unsigned char apic_version [MAX_APICS]; DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; @@ -94,24 +93,21 @@ static int __init mpf_checksum(unsigned char *mp, int len) static void __cpuinit MP_processor_info (struct mpc_config_processor *m) { int cpu; - unsigned char ver; cpumask_t tmp_map; + char *bootup_cpu = ""; if (!(m->mpc_cpuflag & CPU_ENABLED)) { disabled_cpus++; return; } - printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n", - m->mpc_apicid, - (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8, - (m->mpc_cpufeature & CPU_MODEL_MASK)>>4, - m->mpc_apicver); - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { - Dprintk(" Bootup CPU\n"); + bootup_cpu = " (Bootup-CPU)"; boot_cpu_id = m->mpc_apicid; } + + printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu); + if (num_processors >= NR_CPUS) { printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." " Processor ignored.\n", NR_CPUS); @@ -129,17 +125,7 @@ static void __cpuinit MP_processor_info (struct mpc_config_processor *m) return; } #endif - ver = m->mpc_apicver; - physid_set(m->mpc_apicid, phys_cpu_present_map); - /* - * Validate version - */ - if (ver == 0x0) { - printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid); - ver = 0x10; - } - apic_version[m->mpc_apicid] = ver; if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { /* * bios_cpu_apicid is required to have processors listed @@ -179,8 +165,8 @@ static void __init MP_ioapic_info (struct mpc_config_ioapic *m) if (!(m->mpc_flags & MPC_APIC_USABLE)) return; - printk("I/O APIC #%d Version %d at 0x%X.\n", - m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); + printk("I/O APIC #%d at 0x%X.\n", + m->mpc_apicid, m->mpc_apicaddr); if (nr_ioapics >= MAX_IO_APICS) { printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n", MAX_IO_APICS, nr_ioapics); @@ -413,13 +399,10 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) * 2 CPUs, numbered 0 & 1. */ processor.mpc_type = MP_PROCESSOR; - /* Either an integrated APIC or a discrete 82489DX. */ - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + processor.mpc_apicver = 0; processor.mpc_cpuflag = CPU_ENABLED; - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | - (boot_cpu_data.x86_model << 4) | - boot_cpu_data.x86_mask; - processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_cpufeature = 0; + processor.mpc_featureflag = 0; processor.mpc_reserved[0] = 0; processor.mpc_reserved[1] = 0; for (i = 0; i < 2; i++) { @@ -448,7 +431,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) ioapic.mpc_type = MP_IOAPIC; ioapic.mpc_apicid = 2; - ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.mpc_apicver = 0; ioapic.mpc_flags = MPC_APIC_USABLE; ioapic.mpc_apicaddr = 0xFEC00000; MP_ioapic_info(&ioapic); @@ -640,12 +623,11 @@ void __cpuinit mp_register_lapic (u8 id, u8 enabled) processor.mpc_type = MP_PROCESSOR; processor.mpc_apicid = id; - processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); + processor.mpc_apicver = 0; processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; - processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_cpufeature = 0; + processor.mpc_featureflag = 0; processor.mpc_reserved[0] = 0; processor.mpc_reserved[1] = 0; @@ -700,7 +682,7 @@ void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); mp_ioapics[idx].mpc_apicid = id; - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); + mp_ioapics[idx].mpc_apicver = 0; /* * Build basic IRQ lookup table to facilitate gsi->io_apic lookups @@ -711,9 +693,9 @@ void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) mp_ioapic_routing[idx].gsi_end = gsi_base + io_apic_get_redir_entries(idx); - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, " "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + mp_ioapics[idx].mpc_apicaddr, mp_ioapic_routing[idx].gsi_start, mp_ioapic_routing[idx].gsi_end); } -- cgit v1.2.3 From e4251e130deef9de5226cc36faa70a1c6671d3c5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Remove some cruft in apic id checking during processor setup - Remove a define that was used only once - Remove the too large APIC ID check because we always support the full 8bit range of APICs. - Restructure code a bit to be simpler. Cc: len.brown@intel.com Signed-off-by: Andi Kleen --- arch/x86_64/kernel/mpparse.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 32b6977f80c..02eef8f6415 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -100,7 +100,6 @@ static void __cpuinit MP_processor_info (struct mpc_config_processor *m) disabled_cpus++; return; } - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { bootup_cpu = " (Bootup-CPU)"; boot_cpu_id = m->mpc_apicid; @@ -118,13 +117,6 @@ static void __cpuinit MP_processor_info (struct mpc_config_processor *m) cpus_complement(tmp_map, cpu_present_map); cpu = first_cpu(tmp_map); -#if MAX_APICS < 255 - if ((int)m->mpc_apicid > MAX_APICS) { - printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", - m->mpc_apicid, MAX_APICS); - return; - } -#endif physid_set(m->mpc_apicid, phys_cpu_present_map); if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { /* @@ -603,8 +595,6 @@ void __init mp_register_lapic_address(u64 address) set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); if (boot_cpu_id == -1U) boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); - - Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); } void __cpuinit mp_register_lapic (u8 id, u8 enabled) @@ -612,13 +602,7 @@ void __cpuinit mp_register_lapic (u8 id, u8 enabled) struct mpc_config_processor processor; int boot_cpu = 0; - if (id >= MAX_APICS) { - printk(KERN_WARNING "Processor #%d invalid (max %d)\n", - id, MAX_APICS); - return; - } - - if (id == boot_cpu_physical_apicid) + if (id == boot_cpu_id) boot_cpu = 1; processor.mpc_type = MP_PROCESSOR; -- cgit v1.2.3 From aecc63615e15de861db7436c50dade495639132c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Fix coding style and output of the mptable parser Give the printks a consistent prefix. Add some missing white space. Cc: len.brown@intel.com Signed-off-by: Andi Kleen --- arch/x86_64/kernel/mpparse.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 02eef8f6415..20e88f4b564 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -205,7 +205,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) unsigned char *mpt=((unsigned char *)mpc)+count; if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { - printk("SMP mptable: bad signature [%c%c%c%c]!\n", + printk("MPTABLE: bad signature [%c%c%c%c]!\n", mpc->mpc_signature[0], mpc->mpc_signature[1], mpc->mpc_signature[2], @@ -213,31 +213,31 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) return 0; } if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { - printk("SMP mptable: checksum error!\n"); + printk("MPTABLE: checksum error!\n"); return 0; } if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { - printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", + printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", mpc->mpc_spec); return 0; } if (!mpc->mpc_lapic) { - printk(KERN_ERR "SMP mptable: null local APIC address!\n"); + printk(KERN_ERR "MPTABLE: null local APIC address!\n"); return 0; } memcpy(str,mpc->mpc_oem,8); - str[8]=0; - printk(KERN_INFO "OEM ID: %s ",str); + str[8] = 0; + printk(KERN_INFO "MPTABLE: OEM ID: %s ",str); memcpy(str,mpc->mpc_productid,12); - str[12]=0; - printk("Product ID: %s ",str); + str[12] = 0; + printk("MPTABLE: Product ID: %s ",str); - printk("APIC at: 0x%X\n",mpc->mpc_lapic); + printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic); /* save the local APIC address, it might be non-default */ if (!acpi_lapic) - mp_lapic_addr = mpc->mpc_lapic; + mp_lapic_addr = mpc->mpc_lapic; /* * Now process the configuration blocks. @@ -249,7 +249,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) struct mpc_config_processor *m= (struct mpc_config_processor *)mpt; if (!acpi_lapic) - MP_processor_info(m); + MP_processor_info(m); mpt += sizeof(*m); count += sizeof(*m); break; @@ -268,8 +268,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) struct mpc_config_ioapic *m= (struct mpc_config_ioapic *)mpt; MP_ioapic_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); + mpt += sizeof(*m); + count += sizeof(*m); break; } case MP_INTSRC: @@ -278,8 +278,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) (struct mpc_config_intsrc *)mpt; MP_intsrc_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); + mpt += sizeof(*m); + count += sizeof(*m); break; } case MP_LINTSRC: @@ -287,15 +287,15 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) struct mpc_config_lintsrc *m= (struct mpc_config_lintsrc *)mpt; MP_lintsrc_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); + mpt += sizeof(*m); + count += sizeof(*m); break; } } } clustered_apic_check(); if (!num_processors) - printk(KERN_ERR "SMP mptable: no processors registered!\n"); + printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; } -- cgit v1.2.3 From 7a0a2dff1cac1df82acfa0395bc9bc1bf0bc16ef Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Add a missing check for irq flags tracing in NMI NMIs are not supposed to track the irq flags, but TRACE_IRQS_IRETQ did it anyways. Add a check. Cc: mingo@elte.hu Signed-off-by: Andi Kleen --- arch/x86_64/kernel/entry.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 780f9b26169..4fcc0ad8bbe 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -773,7 +773,9 @@ paranoid_exit\trace: testl $3,CS(%rsp) jnz paranoid_userspace\trace paranoid_swapgs\trace: + .if \trace TRACE_IRQS_IRETQ 0 + .endif swapgs paranoid_restore\trace: RESTORE_ALL 8 -- cgit v1.2.3 From 8380aabb99719af583447133f19a4d8074b5c337 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Remove non e820 fallbacks in high level code Drop support for non e820 BIOS calls to get the memory map. The boot assembler code still has some support, but not the C code now. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/e820.c | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index e06c2714ecf..5d1275a9365 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -536,10 +536,14 @@ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) return 0; } -void __init setup_memory_region(void) +void early_panic(char *msg) { - char *who = "BIOS-e820"; + early_printk(msg); + panic(msg); +} +void __init setup_memory_region(void) +{ /* * Try to copy the BIOS-supplied E820-map. * @@ -547,24 +551,10 @@ void __init setup_memory_region(void) * the next section from 1mb->appropriate_mem_k */ sanitize_e820_map(E820_MAP, &E820_MAP_NR); - if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) { - unsigned long mem_size; - - /* compare results from other methods and take the greater */ - if (ALT_MEM_K < EXT_MEM_K) { - mem_size = EXT_MEM_K; - who = "BIOS-88"; - } else { - mem_size = ALT_MEM_K; - who = "BIOS-e801"; - } - - e820.nr_map = 0; - add_memory_region(0, LOWMEMSIZE(), E820_RAM); - add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM); - } + if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) + early_panic("Cannot find a valid memory map"); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map(who); + e820_print_map("BIOS-e820"); } static int __init parse_memopt(char *p) -- cgit v1.2.3 From 26374c7b7dca1ff90607c83d9b82e917119f0456 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Reload CS when startup_64 is used. In long mode the %cs is largely a relic. However there are a few cases like iret where it matters that we have a valid value. Without this patch it is possible to enter the kernel in startup_64 without setting %cs to a valid value. With this patch we don't care what %cs value we enter the kernel with, so long as the cs shadow register indicates it is a privileged code segment. Thanks to Magnus Damm for finding this problem and posting the first workable patch. I have moved the jump to set %cs down a few instructions so we don't need to take an extra jump. Which keeps the code simpler. Signed-of-by: Eric W. Biederman Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head.S | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 4a3326ce709..1e6f8087067 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -185,12 +185,15 @@ startup_64: /* Finally jump to run C code and to be on real kernel address * Since we are running on identity-mapped space we have to jump - * to the full 64bit address , this is only possible as indirect - * jump + * to the full 64bit address, this is only possible as indirect + * jump. In addition we need to ensure %cs is set so we make this + * a far return. */ movq initial_code(%rip),%rax - pushq $0 # fake return address - jmp *%rax + pushq $0 # fake return address to stop unwinder + pushq $__KERNEL_CS # set correct cs + pushq %rax # target address in negative space + lretq /* SMP bootup changes these two */ .align 8 -- cgit v1.2.3 From f2a9e1dec27189a09ff642f2648e49ad9e76607b Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Put .note.* sections into a PT_NOTE segment This patch updates x86_64 linker script to pack any .note.* sections into a PT_NOTE segment in the output file. To do this, we tell ld that we need a PT_NOTE segment. This requires us to start explicitly mapping sections to segments, so we also need to explicitly create PT_LOAD segments for text and data, and map the sections to them appropriately. Fortunately, each section will default to its previous section's segment, so it doesn't take many changes to vmlinux.lds.S. The corresponding change is already made for i386 in -mm and I'd like this patch to join it. The section to segment mappings do change as do the segment flags so some time in -mm would be good for that reason as well, just in case. In particular .data and .bss move from the text segment to the data segment and .data.cacheline_aligned .data.read_mostly are put in the data segment instead of a separate one. I think that it would be possible to exactly match the existing section to segment mapping and flags but it would be a more intrusive change and I'm not sure there is a reason for the existing layout other than it is what you get by default if you don't explicitly specify something else. If there is a reason for the existing layout then I will of course make the more intrusive change. If there is no reason we could probably drop the executable or writable flags from some segments but I don't know how much attention is paid to them anyway so it might not be worth the effort. The vsyscall related sections need to go in a different segment to the normal data segment and so I invented a "user" segment to contain them. I believe this should appear to be another data segment as far as the kernel is concerned so the flags are setup accordingly. The notes will be used in the Xen paravirt_ops backend to provide additional information to the domain builder. I am in the process of converting the xen-unstable kernels and tools over to this scheme at the moment to support this in the future. It has been suggested to me that the notes segment should have flags 0 (i.e. not readable) since it is only used by the loader and is not used at runtime. For now I went with a readable segment since that is what the i386 patch uses. AK: dropped NOTES addition right now because the needed infrastructure for that is not merged yet Signed-off-by: Ian Campbell Signed-off-by: Andi Kleen --- arch/x86_64/kernel/vmlinux.lds.S | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 8d5a5149bb3..2802aa963fa 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -13,6 +13,12 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") OUTPUT_ARCH(i386:x86-64) ENTRY(phys_startup_64) jiffies_64 = jiffies; +PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ + user PT_LOAD FLAGS(7); /* RWE */ + note PT_NOTE FLAGS(4); /* R__ */ +} SECTIONS { . = __START_KERNEL; @@ -31,7 +37,7 @@ SECTIONS KPROBES_TEXT *(.fixup) *(.gnu.warning) - } = 0x9090 + } :text = 0x9090 /* out-of-line lock text */ .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) } @@ -57,7 +63,7 @@ SECTIONS .data : AT(ADDR(.data) - LOAD_OFFSET) { *(.data) CONSTRUCTORS - } + } :data _edata = .; /* End of data section */ @@ -89,7 +95,7 @@ SECTIONS #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user __vsyscall_0 = VSYSCALL_VIRT_ADDR; . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); @@ -135,7 +141,7 @@ SECTIONS . = ALIGN(8192); /* init_task */ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { *(.data.init_task) - } + } :data . = ALIGN(4096); .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { -- cgit v1.2.3 From 53ee11ae0d73f28029a5f0d991bc4dcd7c817e7a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Optimize PDA accesses slightly Based on a idea by Jeremy Fitzhardinge: Replace the volatiles and memory clobbers in the PDA access with telling gcc about access to a proxy PDA structure that doesn't actually exist. But the dummy accesses give a defined ordering for read/write accesses. Also add some memory barriers to the early GS initialization to make sure no PDA access is moved before it. Advantage is some .text savings (probably most from better code for accessing "current"): text data bss dec hex filename 4845647 1223688 615864 6685199 66020f vmlinux 4837780 1223688 615864 6677332 65e354 vmlinux-pda 1.2% smaller code Cc: Jeremy Fitzhardinge Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup64.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index e85cfbb49b6..491361752c7 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -121,7 +121,10 @@ void pda_init(int cpu) /* Setup up data that may be needed in __get_free_pages early */ asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); + /* Memory clobbers used to order PDA accessed */ + mb(); wrmsrl(MSR_GS_BASE, pda); + mb(); pda->cpunumber = cpu; pda->irqcount = -1; -- cgit v1.2.3 From f574164491d00d28b727d713685fb5edc9138200 Mon Sep 17 00:00:00 2001 From: Keith Owens Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Remove most of the special cases for the debug IST stack Remove most of the special cases for the debug IST stack. This is a follow on clean up patch, it requires the bug fix patch that adds orig_ist. Signed-off-by: Keith Owens Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup64.c | 21 +++++---------------- arch/x86_64/kernel/traps.c | 21 +-------------------- 2 files changed, 6 insertions(+), 36 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 491361752c7..9332d2361e0 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -237,28 +237,17 @@ void __cpuinit cpu_init (void) * set up and load the per-CPU TSS */ for (v = 0; v < N_EXCEPTION_STACKS; v++) { + static const unsigned int order[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER + }; if (cpu) { - static const unsigned int order[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER - }; - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); if (!estacks) panic("Cannot allocate exception stack %ld %d\n", v, cpu); } - switch (v + 1) { -#if DEBUG_STKSZ > EXCEPTION_STKSZ - case DEBUG_STACK: - cpu_pda(cpu)->debugstack = (unsigned long)estacks; - estacks += DEBUG_STKSZ; - break; -#endif - default: - estacks += EXCEPTION_STKSZ; - break; - } + estacks += PAGE_SIZE << order[v]; orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks; } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 38bc821e457..fb8486eca1b 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -162,26 +162,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, * 'stack' is in one of them: */ for (k = 0; k < N_EXCEPTION_STACKS; k++) { - unsigned long end; - - /* - * set 'end' to the end of the exception stack. - */ - switch (k + 1) { - /* - * TODO: this block is not needed i think, because - * setup64.c:cpu_init() sets up t->ist[DEBUG_STACK] - * properly too. - */ -#if DEBUG_STKSZ > EXCEPTION_STKSZ - case DEBUG_STACK: - end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ; - break; -#endif - default: - end = per_cpu(orig_ist, cpu).ist[k]; - break; - } + unsigned long end = per_cpu(orig_ist, cpu).ist[k]; /* * Is 'stack' above this exception frame's end? * If yes then skip to the next frame. -- cgit v1.2.3 From 4bfaaef01a1badb9e8ffb0c0a37cd2379008d21f Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Avoid overwriting the current pgd (V4, x86_64) kexec: Avoid overwriting the current pgd (V4, x86_64) This patch upgrades the x86_64-specific kexec code to avoid overwriting the current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used to start a secondary kernel that dumps the memory of the previous kernel. The code introduces a new set of page tables. These tables are used to provide an executable identity mapping without overwriting the current pgd. Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- arch/x86_64/kernel/machine_kexec.c | 71 ++++++++------- arch/x86_64/kernel/relocate_kernel.S | 171 +++++++++++++++++++++++++++++++---- 2 files changed, 189 insertions(+), 53 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c index 2e94c072d84..0497e3bd5bf 100644 --- a/arch/x86_64/kernel/machine_kexec.c +++ b/arch/x86_64/kernel/machine_kexec.c @@ -15,6 +15,15 @@ #include #include +#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) +static u64 kexec_pgd[512] PAGE_ALIGNED; +static u64 kexec_pud0[512] PAGE_ALIGNED; +static u64 kexec_pmd0[512] PAGE_ALIGNED; +static u64 kexec_pte0[512] PAGE_ALIGNED; +static u64 kexec_pud1[512] PAGE_ALIGNED; +static u64 kexec_pmd1[512] PAGE_ALIGNED; +static u64 kexec_pte1[512] PAGE_ALIGNED; + static void init_level2_page(pmd_t *level2p, unsigned long addr) { unsigned long end_addr; @@ -144,32 +153,19 @@ static void load_segments(void) ); } -typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page, - unsigned long control_code_buffer, - unsigned long start_address, - unsigned long pgtable) ATTRIB_NORET; - -extern const unsigned char relocate_new_kernel[]; -extern const unsigned long relocate_new_kernel_size; - int machine_kexec_prepare(struct kimage *image) { - unsigned long start_pgtable, control_code_buffer; + unsigned long start_pgtable; int result; /* Calculate the offsets */ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - control_code_buffer = start_pgtable + PAGE_SIZE; /* Setup the identity mapped 64bit page table */ result = init_pgtable(image, start_pgtable); if (result) return result; - /* Place the code in the reboot code buffer */ - memcpy(__va(control_code_buffer), relocate_new_kernel, - relocate_new_kernel_size); - return 0; } @@ -184,28 +180,34 @@ void machine_kexec_cleanup(struct kimage *image) */ NORET_TYPE void machine_kexec(struct kimage *image) { - unsigned long page_list; - unsigned long control_code_buffer; - unsigned long start_pgtable; - relocate_new_kernel_t rnk; + unsigned long page_list[PAGES_NR]; + void *control_page; /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); - /* Calculate the offsets */ - page_list = image->head; - start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - control_code_buffer = start_pgtable + PAGE_SIZE; - - /* Set the low half of the page table to my identity mapped - * page table for kexec. Leave the high half pointing at the - * kernel pages. Don't bother to flush the global pages - * as that will happen when I fully switch to my identity mapped - * page table anyway. - */ - memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2); - __flush_tlb(); - + control_page = page_address(image->control_code_page) + PAGE_SIZE; + memcpy(control_page, relocate_kernel, PAGE_SIZE); + + page_list[PA_CONTROL_PAGE] = __pa(control_page); + page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; + page_list[PA_PGD] = __pa(kexec_pgd); + page_list[VA_PGD] = (unsigned long)kexec_pgd; + page_list[PA_PUD_0] = __pa(kexec_pud0); + page_list[VA_PUD_0] = (unsigned long)kexec_pud0; + page_list[PA_PMD_0] = __pa(kexec_pmd0); + page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; + page_list[PA_PTE_0] = __pa(kexec_pte0); + page_list[VA_PTE_0] = (unsigned long)kexec_pte0; + page_list[PA_PUD_1] = __pa(kexec_pud1); + page_list[VA_PUD_1] = (unsigned long)kexec_pud1; + page_list[PA_PMD_1] = __pa(kexec_pmd1); + page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; + page_list[PA_PTE_1] = __pa(kexec_pte1); + page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + + page_list[PA_TABLE_PAGE] = + (unsigned long)__pa(page_address(image->control_code_page)); /* The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is @@ -222,9 +224,10 @@ NORET_TYPE void machine_kexec(struct kimage *image) */ set_gdt(phys_to_virt(0),0); set_idt(phys_to_virt(0),0); + /* now call it */ - rnk = (relocate_new_kernel_t) control_code_buffer; - (*rnk)(page_list, control_code_buffer, image->start, start_pgtable); + relocate_kernel((unsigned long)image->head, (unsigned long)page_list, + image->start); } /* crashkernel=size@addr specifies the location to reserve for diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S index d24fa9b72a2..14e95872c6a 100644 --- a/arch/x86_64/kernel/relocate_kernel.S +++ b/arch/x86_64/kernel/relocate_kernel.S @@ -7,31 +7,169 @@ */ #include +#include +#include - /* - * Must be relocatable PIC code callable as a C function, that once - * it starts can not use the previous processes stack. - */ - .globl relocate_new_kernel +/* + * Must be relocatable PIC code callable as a C function + */ + +#define PTR(x) (x << 3) +#define PAGE_ALIGNED (1 << PAGE_SHIFT) +#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */ + + .text + .align PAGE_ALIGNED .code64 + .globl relocate_kernel +relocate_kernel: + /* %rdi indirection_page + * %rsi page_list + * %rdx start address + */ + + /* map the control page at its virtual address */ + + movq $0x0000ff8000000000, %r10 /* mask */ + mov $(39 - 3), %cl /* bits to shift */ + movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PGD)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PUD_0)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PUD_0)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PMD_0)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PMD_0)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PTE_0)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PTE_0)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + /* identity map the control page at its physical address */ + + movq $0x0000ff8000000000, %r10 /* mask */ + mov $(39 - 3), %cl /* bits to shift */ + movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PGD)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PUD_1)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PUD_1)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PMD_1)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PMD_1)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PTE_1)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PTE_1)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + relocate_new_kernel: - /* %rdi page_list - * %rsi reboot_code_buffer + /* %rdi indirection_page + * %rsi page_list * %rdx start address - * %rcx page_table - * %r8 arg5 - * %r9 arg6 */ /* zero out flags, and disable interrupts */ pushq $0 popfq - /* set a new stack at the bottom of our page... */ - lea 4096(%rsi), %rsp + /* get physical address of control page now */ + /* this is impossible after page table switch */ + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + + /* get physical address of page table now too */ + movq PTR(PA_TABLE_PAGE)(%rsi), %rcx - /* store the parameters back on the stack */ - pushq %rdx /* store the start address */ + /* switch to new set of page tables */ + movq PTR(PA_PGD)(%rsi), %r9 + movq %r9, %cr3 + + /* setup a new stack at the end of the physical control page */ + lea 4096(%r8), %rsp + + /* jump to identity mapped page */ + addq $(identity_mapped - relocate_kernel), %r8 + pushq %r8 + ret + +identity_mapped: + /* store the start address on the stack */ + pushq %rdx /* Set cr0 to a known state: * 31 1 == Paging enabled @@ -136,8 +274,3 @@ relocate_new_kernel: xorq %r15, %r15 ret -relocate_new_kernel_end: - - .globl relocate_new_kernel_size -relocate_new_kernel_size: - .quad relocate_new_kernel_end - relocate_new_kernel -- cgit v1.2.3 From e8c7391de4cd91b2cbb0c791f69ba1e066595848 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Don't use kernel_text_address in oops context Because it can take spinlocks. Suggested by Mathieu Desnoyers Cc: Mathieu Desnoyers Signed-off-by: Andi Kleen --- arch/x86_64/kernel/traps.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index fb8486eca1b..01f2a8d254c 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -299,7 +299,9 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s #define HANDLE_STACK(cond) \ do while (cond) { \ unsigned long addr = *stack++; \ - if (kernel_text_address(addr)) { \ + if (oops_in_progress ? \ + __kernel_text_address(addr) : \ + kernel_text_address(addr)) { \ /* \ * If the address is either in the text segment of the \ * kernel, or in the region which contains vmalloc'ed \ -- cgit v1.2.3 From 0a4254058037eb172758961d0a5b94f4320a1425 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Add the canary field to the PDA area and the task struct This patch adds the per thread cookie field to the task struct and the PDA. Also it makes sure that the PDA value gets the new cookie value at context switch, and that a new task gets a new cookie at task creation time. Signed-off-by: Arjan van Ven Signed-off-by: Ingo Molnar Signed-off-by: Andi Kleen CC: Andi Kleen --- arch/x86_64/kernel/process.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 9e9a70e50c7..fba8dfeda67 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -625,6 +625,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) unlazy_fpu(prev_p); write_pda(kernelstack, task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); +#ifdef CONFIG_CC_STACKPROTECTOR + write_pda(stack_canary, next_p->stack_canary); + /* + * Build time only check to make sure the stack_canary is at + * offset 40 in the pda; this is a gcc ABI requirement + */ + BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40); +#endif /* * Now maybe reload the debug registers and handle I/O bitmaps -- cgit v1.2.3 From 96e540492ab54423f3693958329e095878f1f12b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] Fix a irqcount comment in entry.S Signed-off-by: Andi Kleen --- arch/x86_64/kernel/entry.S | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 4fcc0ad8bbe..ea32688386f 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -519,7 +519,12 @@ END(stub_rt_sigreturn) testl $3,CS(%rdi) je 1f swapgs -1: incl %gs:pda_irqcount # RED-PEN should check preempt count + /* irqcount is used to check if a CPU is already on an interrupt + stack or not. While this is essentially redundant with preempt_count + it is a little cheaper to use a separate counter in the PDA + (short of moving irq_enter into assembly, which would be too + much work) */ +1: incl %gs:pda_irqcount cmoveq %gs:pda_irqstackptr,%rsp push %rbp # backlink for old unwinder /* -- cgit v1.2.3 From 7b0bda74f7e77f362eaeee837e7911238acf4c76 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] Fix a PDA warning uncovered by the new type checking Fix linux/arch/x86_64/kernel/process.c: In function __switch_to: linux/arch/x86_64/kernel/process.c:626: warning: assignment makes integer from pointer without a cast Signed-off-by: Andi Kleen --- arch/x86_64/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index fba8dfeda67..885c318f76a 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -624,7 +624,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) And the AMD workaround requires it to be after DS reload. */ unlazy_fpu(prev_p); write_pda(kernelstack, - task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); + (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); #ifdef CONFIG_CC_STACKPROTECTOR write_pda(stack_canary, next_p->stack_canary); /* -- cgit v1.2.3 From 54dbc0c9ebefb38840c6b07fa6eabaeb96c921f5 Mon Sep 17 00:00:00 2001 From: "adurbin@google.com" Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] insert IOAPIC(s) and Local APIC into resource map This patch places the IOAPIC(s) and the Local APIC specified by ACPI tables into the resource map. The APICs will then be visible within /proc/iomem Signed-off-by: Aaron Durbin Signed-off-by: Andi Kleen --- arch/x86_64/kernel/apic.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 6472e321cad..135ff25e6b4 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -45,6 +46,11 @@ int apic_calibrate_pmtmr __initdata; int disable_apic_timer __initdata; +static struct resource lapic_resource = { + .name = "Local APIC", + .flags = IORESOURCE_MEM | IORESOURCE_BUSY, +}; + /* * cpu_mask that denotes the CPUs that needs timer interrupt coming in as * IPIs in place of local APIC timers @@ -585,6 +591,40 @@ static int __init detect_init_APIC (void) return 0; } +#ifdef CONFIG_X86_IO_APIC +static struct resource * __init ioapic_setup_resources(void) +{ +#define IOAPIC_RESOURCE_NAME_SIZE 11 + unsigned long n; + struct resource *res; + char *mem; + int i; + + if (nr_ioapics <= 0) + return NULL; + + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); + n *= nr_ioapics; + + res = alloc_bootmem(n); + + if (!res) + return NULL; + + memset(res, 0, n); + mem = (void *)&res[nr_ioapics]; + + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; + } + + return res; +} +#endif + void __init init_apic_mappings(void) { unsigned long apic_phys; @@ -604,6 +644,11 @@ void __init init_apic_mappings(void) apic_mapped = 1; apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys); + /* Put local APIC into the resource map. */ + lapic_resource.start = apic_phys; + lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; + insert_resource(&iomem_resource, &lapic_resource); + /* * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). @@ -613,7 +658,9 @@ void __init init_apic_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; int i; + struct resource *ioapic_res; + ioapic_res = ioapic_setup_resources(); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { ioapic_phys = mp_ioapics[i].mpc_apicaddr; @@ -625,6 +672,13 @@ void __init init_apic_mappings(void) apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n", __fix_to_virt(idx), ioapic_phys); idx++; + + if (ioapic_res) { + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + insert_resource(&iomem_resource, ioapic_res); + ioapic_res++; + } } } } -- cgit v1.2.3 From 3022d734a54cbd2b65eea9a024564821101b4a9a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] Fix zeroing on exception in copy_*_user - Don't zero for __copy_from_user_inatomic following i386. This will prevent spurious zeros for parallel file system writers when one does a exception - The string instruction version didn't zero the output on exception. Oops. Also I cleaned up the code a bit while I was at it and added a minor optimization to the string instruction path. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/x8664_ksyms.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index 370952c4ff2..c3454af5e3a 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -29,6 +29,7 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(copy_user_generic); EXPORT_SYMBOL(copy_from_user); EXPORT_SYMBOL(copy_to_user); +EXPORT_SYMBOL(__copy_from_user_inatomic); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); -- cgit v1.2.3 From b38337a624c4d3c2c3d9cdf27d952ca94181c6a8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] Mark per cpu data initialization __initdata again Before 2.6.16 this was changed to work around code that accessed CPUs not in the possible map. But that code should be all fixed now, so mark it __initdata again. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/vmlinux.lds.S | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 2802aa963fa..d0564f1bcb0 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -216,14 +216,12 @@ SECTIONS __initramfs_start = .; .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } __initramfs_end = .; - /* temporary here to work around NR_CPUS. If you see this comment in 2.6.17+ - complain */ - . = ALIGN(4096); - __init_end = .; - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); __per_cpu_start = .; .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } __per_cpu_end = .; + . = ALIGN(4096); + __init_end = .; . = ALIGN(4096); __nosave_begin = .; -- cgit v1.2.3 From a15da49debaf7f09460a886b0ecd08588410715e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] Fix idle notifiers Previously exit_idle would be called more often than enter_idle Now instead of using complicated tests just keep track of it using the per CPU variable as a flip flop. I moved the idle state into the PDA to make the access more efficient. Original bug report and an initial patch from Stephane Eranian, but redone by AK. Cc: Stephane Eranian Signed-off-by: Andi Kleen --- arch/x86_64/kernel/process.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 885c318f76a..458006ae19f 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -80,25 +80,25 @@ void idle_notifier_unregister(struct notifier_block *n) } EXPORT_SYMBOL(idle_notifier_unregister); -enum idle_state { CPU_IDLE, CPU_NOT_IDLE }; -static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE; - void enter_idle(void) { - __get_cpu_var(idle_state) = CPU_IDLE; + write_pda(isidle, 1); atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); } static void __exit_idle(void) { - __get_cpu_var(idle_state) = CPU_NOT_IDLE; + if (read_pda(isidle) == 0) + return; + write_pda(isidle, 0); atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); } /* Called from interrupts to signify idle end */ void exit_idle(void) { - if (current->pid | read_pda(irqcount)) + /* idle loop has pid 0 */ + if (current->pid) return; __exit_idle(); } @@ -220,6 +220,9 @@ void cpu_idle (void) play_dead(); enter_idle(); idle(); + /* In many cases the interrupt that ended idle + has already called exit_idle. But some idle + loops can be woken up without interrupt. */ __exit_idle(); } -- cgit v1.2.3 From 56dd669a138c40ea6cdae487f233430d12372767 Mon Sep 17 00:00:00 2001 From: Aaron Durbin Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] Insert GART region into resource map Patch inserts the GART region into the iomem resource map. The GART will then be visible within /proc/iomem. It will also allow for other users utilizing the GART to subreserve the region (agp or IOMMU). Signed-off-by: Aaron Durbin Signed-off-by: Andi Kleen --- arch/x86_64/kernel/aperture.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index 04dbf166252..f851e1f9c82 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +34,18 @@ int fallback_aper_force __initdata = 0; int fix_aperture __initdata = 1; +static struct resource gart_resource = { + .name = "GART", + .flags = IORESOURCE_MEM, +}; + +static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) +{ + gart_resource.start = aper_base; + gart_resource.end = aper_base + aper_size - 1; + insert_resource(&iomem_resource, &gart_resource); +} + /* This code runs before the PCI subsystem is initialized, so just access the northbridge directly. */ @@ -62,6 +75,7 @@ static u32 __init allocate_aperture(void) } printk("Mapping aperture over %d KB of RAM @ %lx\n", aper_size >> 10, __pa(p)); + insert_aperture_resource((u32)__pa(p), aper_size); return (u32)__pa(p); } @@ -233,8 +247,13 @@ void __init iommu_hole_init(void) last_aper_base = aper_base; } - if (!fix && !fallback_aper_force) + if (!fix && !fallback_aper_force) { + if (last_aper_base) { + unsigned long n = (32 * 1024 * 1024) << last_aper_order; + insert_aperture_resource((u32)last_aper_base, n); + } return; + } if (!fallback_aper_force) aper_alloc = search_agp_bridge(&aper_order, &valid_agp); -- cgit v1.2.3 From 35d534a3ff4905c0a7bddbad0fc9df55967742c7 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] x86: - restore i8259A eoi status on resume Got it. i8259A_resume calls init_8259A(0) unconditionally, even if auto_eoi has been set. Keep track of the current status and restore that on resume. This fixes it for AMD64 and i386. Signed-off-by: Matthew Garrett Signed-off-by: Andi Kleen --- arch/x86_64/kernel/i8259.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index d2f5acde574..2dd51f364ea 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -123,6 +123,8 @@ void (*interrupt[NR_IRQS])(void) = { DEFINE_SPINLOCK(i8259A_lock); +static int i8259A_auto_eoi; + static void end_8259A_irq (unsigned int irq) { if (irq > 256) { @@ -336,6 +338,8 @@ void init_8259A(int auto_eoi) { unsigned long flags; + i8259A_auto_eoi = auto_eoi; + spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, 0x21); /* mask all of 8259A-1 */ @@ -394,7 +398,7 @@ static void save_ELCR(char *trigger) static int i8259A_resume(struct sys_device *dev) { - init_8259A(0); + init_8259A(i8259A_auto_eoi); restore_ELCR(irq_trigger); return 0; } -- cgit v1.2.3 From 27fbe5b28a2ffef171c6005f304ea4f80fcdcc01 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] Use string instructions for Core2 copy/clear It is faster than using a unrolled loop for the use cases the kernel cares about (cached, sizes typically < 4K) Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index e328e3eb8cb..085f2e09b9d 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -843,6 +843,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); + if (c->x86 == 6) + set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); c->x86_max_cores = intel_num_cpu_cores(c); -- cgit v1.2.3 From 2049336f60d297c85ac977245b9326ec00396114 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] Don't synchronize time reading on single core AMD systems We do some additional CPU synchronization in gettimeofday et.al. to make sure the time stamps are always monotonic over multiple CPUs. But on single core systems that is not needed. So don't do it. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 085f2e09b9d..26524ce3b23 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -728,6 +728,12 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) /* Fix cpuid4 emulation for more */ num_cache_leaves = 3; + + /* When there is only one core no need to synchronize RDTSC */ + if (num_possible_cpus() == 1) + set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); + else + clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); } static void __cpuinit detect_ht(struct cpuinfo_x86 *c) -- cgit v1.2.3 From dd54a11004b2c9a1f136225f880e021a43b0eadc Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] Remove all traces of signal number conversion This was old code that was needed for iBCS and x86-64 never supported that. Pointed out by Albert Cahalan Signed-off-by: Andi Kleen --- arch/x86_64/kernel/signal.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 7f58bc9a056..49ec324cd14 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -277,11 +277,6 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, #endif /* Set up registers for signal handler */ - { - struct exec_domain *ed = current_thread_info()->exec_domain; - if (unlikely(ed && ed->signal_invmap && sig < 32)) - sig = ed->signal_invmap[sig]; - } regs->rdi = sig; /* In case the signal handler was declared without prototypes */ regs->rax = 0; -- cgit v1.2.3 From adf1423698f00d00b267f7dca8231340ce7d65ef Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] i386/x86-64: Work around gcc bug with noreturn functions in unwinder Current gcc generates calls not jumps to noreturn functions. When that happens the return address can point to the next function, which confuses the unwinder. This patch works around it by marking asynchronous exception frames in contrast normal call frames in the unwind information. Then teach the unwinder to decode this. For normal call frames the unwinder now subtracts one from the address which avoids this problem. The standard libgcc unwinder uses the same trick. It doesn't include adjustment of the printed address (i.e. for the original example, it'd still be kernel_math_error+0 that gets displayed, but the unwinder wouldn't get confused anymore. This only works with binutils 2.6.17+ and some versions of H.J.Lu's 2.6.16 unfortunately because earlier binutils don't support .cfi_signal_frame [AK: added automatic detection of the new binutils and wrote description] Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/x86_64/kernel/entry.S | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index ea32688386f..4cbc65290ae 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -123,6 +123,7 @@ .macro CFI_DEFAULT_STACK start=1 .if \start CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8 .else CFI_DEF_CFA_OFFSET SS+8 @@ -207,6 +208,7 @@ END(ret_from_fork) ENTRY(system_call) CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,PDA_STACKOFFSET CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ @@ -324,6 +326,7 @@ END(system_call) */ ENTRY(int_ret_from_sys_call) CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-ARGOFFSET /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/ CFI_REL_OFFSET rsp,RSP-ARGOFFSET @@ -484,6 +487,7 @@ END(stub_rt_sigreturn) */ .macro _frame ref CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-\ref /*CFI_REL_OFFSET ss,SS-\ref*/ CFI_REL_OFFSET rsp,RSP-\ref -- cgit v1.2.3 From 658fdbef66e5e9be79b457edc2cbbb3add840aa9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] Don't leak NT bit into next task SYSENTER can cause a NT to be set which might cause crashes on the IRET in the next task. Following similar i386 patch from Linus. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/entry.S | 4 ++++ arch/x86_64/kernel/setup64.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 4cbc65290ae..2802524104f 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -155,6 +155,10 @@ /* rdi: prev */ ENTRY(ret_from_fork) CFI_DEFAULT_STACK + push kernel_eflags(%rip) + CFI_ADJUST_CFA_OFFSET 4 + popf # reset kernel eflags + CFI_ADJUST_CFA_OFFSET -4 call schedule_tail GET_THREAD_INFO(%rcx) testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 9332d2361e0..8c4b80fe71a 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -180,6 +180,8 @@ void __cpuinit check_efer(void) } } +unsigned long kernel_eflags; + /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT @@ -281,4 +283,6 @@ void __cpuinit cpu_init (void) set_debugreg(0UL, 7); fpu_init(); + + raw_local_save_flags(kernel_eflags); } -- cgit v1.2.3 From f157cbb1eb9ce3f33a401ec6d20eb3eb852351a3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] i386/x86-64: Make all early PCI scans dependent on CONFIG_PCI This is useful on systems with broken PCI bus. Affects various scans in x86-64 and i386's early ACPI quirk scan. Cc: gregkh@suse.de Cc: len.brown@intel.com Cc: Trammell Hudson Signed-off-by: Andi Kleen --- arch/x86_64/kernel/Makefile | 3 ++- arch/x86_64/kernel/setup.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 000e67e8f02..2466fbd035e 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -8,7 +8,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - pci-dma.o pci-nommu.o alternative.o early-quirks.o + pci-dma.o pci-nommu.o alternative.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o @@ -39,6 +39,7 @@ obj-$(CONFIG_K8_NB) += k8.o obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_PCI) += early-quirks.o obj-y += topology.o obj-y += intel_cacheinfo.o diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 26524ce3b23..3d8309b1236 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -493,7 +493,9 @@ void __init setup_arch(char **cmdline_p) paging_init(); +#ifdef CONFIG_PCI early_quirks(); +#endif /* * set this early, so we dont allocate cpu0 -- cgit v1.2.3 From 0637a70a5db98182d9ad3d6ae1ee30acf20afde9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] x86: Allow disabling early pci scans with pci=noearly or disallowing conf1 Some buggy systems can machine check when config space accesses happen for some non existent devices. i386/x86-64 do some early device scans that might trigger this. Allow pci=noearly to disable this. Also when type 1 is disabling also don't do any early accesses which are always type1. This moves the pci= configuration parsing to be a early parameter. I don't think this can break anything because it only changes a single global that is only used by PCI. Cc: gregkh@suse.de Cc: Trammell Hudson Signed-off-by: Andi Kleen --- arch/x86_64/kernel/aperture.c | 2 +- arch/x86_64/kernel/early-quirks.c | 4 ++++ arch/x86_64/kernel/pci-calgary.c | 3 +++ arch/x86_64/kernel/vsmp.c | 3 +++ 4 files changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index f851e1f9c82..b487396c4c5 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -212,7 +212,7 @@ void __init iommu_hole_init(void) u64 aper_base, last_aper_base = 0; int valid_agp = 0; - if (iommu_aperture_disabled || !fix_aperture) + if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) return; printk("Checking aperture...\n"); diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c index d637cff1c4b..208e38a372c 100644 --- a/arch/x86_64/kernel/early-quirks.c +++ b/arch/x86_64/kernel/early-quirks.c @@ -82,6 +82,10 @@ static struct chipset early_qrk[] = { void __init early_quirks(void) { int num, slot, func; + + if (!early_pci_allowed()) + return; + /* Poor man's PCI discovery */ for (num = 0; num < 32; num++) { for (slot = 0; slot < 32; slot++) { diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 466588f9560..cfb09b07ae9 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -924,6 +924,9 @@ void __init detect_calgary(void) if (swiotlb || no_iommu || iommu_detected) return; + if (!early_pci_allowed()) + return; + specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { diff --git a/arch/x86_64/kernel/vsmp.c b/arch/x86_64/kernel/vsmp.c index 92f70c74965..044e852bd25 100644 --- a/arch/x86_64/kernel/vsmp.c +++ b/arch/x86_64/kernel/vsmp.c @@ -20,6 +20,9 @@ static int __init vsmp_init(void) void *address; unsigned int cap, ctl; + if (!early_pci_allowed()) + return 0; + /* Check if we are running on a ScaleMP vSMP box */ if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) || (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL)) -- cgit v1.2.3 From 15d5f8398311f565682959daaca30e3ca7aea600 Mon Sep 17 00:00:00 2001 From: Dmitriy Zavin Date: Tue, 26 Sep 2006 10:52:42 +0200 Subject: [PATCH] x86: Refactor thermal throttle processing Refactor the event processing (syslog messaging and rate limiting) into separate file therm_throt.c. This allows consistent reporting of CPU thermal throttle events. After ACK'ing the interrupt, if the event is current, the user (p4.c/mce_intel.c) calls therm_throt_process to log (and rate limit) the event. If that function returns 1, the user has the option to log things further (such as to mce_log in x86_64). AK: minor cleanup Signed-off-by: Dmitriy Zavin Signed-off-by: Andi Kleen --- arch/x86_64/kernel/Makefile | 4 ++-- arch/x86_64/kernel/mce.c | 27 +++++++++++++++++++++++++++ arch/x86_64/kernel/mce_intel.c | 27 ++++++--------------------- 3 files changed, 35 insertions(+), 23 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 2466fbd035e..3c7cbff04d3 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -11,7 +11,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ pci-dma.o pci-nommu.o alternative.o obj-$(CONFIG_STACKTRACE) += stacktrace.o -obj-$(CONFIG_X86_MCE) += mce.o +obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ @@ -46,6 +46,7 @@ obj-y += intel_cacheinfo.o CFLAGS_vsyscall.o := $(PROFILING) -g0 +therm_throt-y += ../../i386/kernel/cpu/mcheck/therm_throt.o bootflag-y += ../../i386/kernel/bootflag.o cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o topology-y += ../../i386/kernel/topology.o @@ -55,4 +56,3 @@ quirks-y += ../../i386/kernel/quirks.o i8237-y += ../../i386/kernel/i8237.o msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o alternative-y += ../../i386/kernel/alternative.o - diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 1a93c373840..bbea88801d8 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -274,6 +274,33 @@ void do_machine_check(struct pt_regs * regs, long error_code) atomic_dec(&mce_entry); } +#ifdef CONFIG_X86_MCE_INTEL +/*** + * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog + * @cpu: The CPU on which the event occured. + * @status: Event status information + * + * This function should be called by the thermal interrupt after the + * event has been processed and the decision was made to log the event + * further. + * + * The status parameter will be saved to the 'status' field of 'struct mce' + * and historically has been the register value of the + * MSR_IA32_THERMAL_STATUS (Intel) msr. + */ +void mce_log_therm_throt_event(unsigned int cpu, __u64 status) +{ + struct mce m; + + memset(&m, 0, sizeof(m)); + m.cpu = cpu; + m.bank = MCE_THERMAL_BANK; + m.status = status; + rdtscll(m.tsc); + mce_log(&m); +} +#endif /* CONFIG_X86_MCE_INTEL */ + /* * Periodic polling timer for "silent" machine check errors. */ diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c index 8f533d2c40c..dec11219e27 100644 --- a/arch/x86_64/kernel/mce_intel.c +++ b/arch/x86_64/kernel/mce_intel.c @@ -11,36 +11,21 @@ #include #include #include - -static DEFINE_PER_CPU(unsigned long, next_check); +#include asmlinkage void smp_thermal_interrupt(void) { - struct mce m; + __u64 msr_val; ack_APIC_irq(); exit_idle(); irq_enter(); - if (time_before(jiffies, __get_cpu_var(next_check))) - goto done; - - __get_cpu_var(next_check) = jiffies + HZ*300; - memset(&m, 0, sizeof(m)); - m.cpu = smp_processor_id(); - m.bank = MCE_THERMAL_BANK; - rdtscll(m.tsc); - rdmsrl(MSR_IA32_THERM_STATUS, m.status); - if (m.status & 0x1) { - printk(KERN_EMERG - "CPU%d: Temperature above threshold, cpu clock throttled\n", m.cpu); - add_taint(TAINT_MACHINE_CHECK); - } else { - printk(KERN_EMERG "CPU%d: Temperature/speed normal\n", m.cpu); - } - mce_log(&m); -done: + rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + if (therm_throt_process(msr_val & 1)) + mce_log_therm_throt_event(smp_processor_id(), msr_val); + irq_exit(); } -- cgit v1.2.3 From 3222b36f46c22f46697a0a53fa8804153a32669f Mon Sep 17 00:00:00 2001 From: Dmitriy Zavin Date: Tue, 26 Sep 2006 10:52:42 +0200 Subject: [PATCH] x86: Add a cumulative thermal throttle event counter. The counter is exported to /sys that keeps track of the number of thermal events, such that the user knows how bad the thermal problem might be (since the logging to syslog and mcelog is rate limited). AK: Fixed cpu hotplug locking Signed-off-by: Dmitriy Zavin Signed-off-by: Andi Kleen --- arch/x86_64/kernel/mce_intel.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c index dec11219e27..6551505d8a2 100644 --- a/arch/x86_64/kernel/mce_intel.c +++ b/arch/x86_64/kernel/mce_intel.c @@ -77,6 +77,9 @@ static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c) apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", cpu, tm2 ? "TM2" : "TM1"); + + /* enable thermal throttle processing */ + atomic_set(&therm_throt_en, 1); return; } -- cgit v1.2.3 From dcf10307c3fff2bca4b104ad8fe4e3aa8dcb2ad1 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 26 Sep 2006 10:52:42 +0200 Subject: [PATCH] i386/x86-64: New Intel feature flags Add supplemental SSE3 instructions flag, and Direct Cache Access flag. As described in "Intel Processor idenfication and the CPUID instruction AP485 Sept 2006" AK: also added for x86-64 Signed-off-by: Dave Jones Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 3d8309b1236..3c7ad267d8a 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -1071,8 +1071,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* Intel-defined (#2) */ "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", - "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, + NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* VIA/Cyrix/Centaur-defined */ -- cgit v1.2.3