From 973a2dd1d50a11d380086601f14e59116f93e8c5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:32 +0100 Subject: x86, mce: disable machine checks on suspend Impact: Bug fix During suspend it is not reliable to process machine check exceptions, because CPUs disappear but can still get machine check broadcasts. Also the system is slightly more likely to machine check them, but the handler is typically not a position to handle them in a meaningfull way. So disable them during suspend and enable them during resume. Also make sure they are always disabled on hot-unplugged CPUs. This new code assumes that suspend always hotunplugs all non BP CPUs. v2: Remove the WARN_ONs Thomas objected to. Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch/x86/kernel/cpu/mcheck/mce_64.c') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 25cf624eccb..5ed80991ab9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -728,6 +728,29 @@ __setup("mce=", mcheck_enable); * Sysfs support */ +/* + * Disable machine checks on suspend and shutdown. We can't really handle + * them later. + */ +static int mce_disable(void) +{ + int i; + + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + return 0; +} + +static int mce_suspend(struct sys_device *dev, pm_message_t state) +{ + return mce_disable(); +} + +static int mce_shutdown(struct sys_device *dev) +{ + return mce_disable(); +} + /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. Only one CPU is active at this time, the others get readded later using CPU hotplug. */ @@ -752,6 +775,8 @@ static void mce_restart(void) } static struct sysdev_class mce_sysclass = { + .suspend = mce_suspend, + .shutdown = mce_shutdown, .resume = mce_resume, .name = "machinecheck", }; -- cgit v1.2.3 From 123aa76ec0cab5d4881cd8509faed43231e68801 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:27 +0100 Subject: x86, mce: don't disable machine checks during code patching Impact: low priority bug fix This removes part of a a patch I added myself some time ago. After some consideration the patch was a bad idea. In particular it stopped machine check exceptions during code patching. To quote the comment: * MCEs only happen when something got corrupted and in this * case we must do something about the corruption. * Ignoring it is worse than a unlikely patching race. * Also machine checks tend to be broadcast and if one CPU * goes into machine check the others follow quickly, so we don't * expect a machine check to cause undue problems during to code * patching. So undo the machine check related parts of 8f4e956b313dcccbc7be6f10808952345e3b638c NMIs are still disabled. This only removes code, the only additions are a new comment. Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck/mce_64.c') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 5ed80991ab9..25ccdbec86e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -680,20 +680,6 @@ static struct miscdevice mce_log_device = { &mce_chrdev_ops, }; -static unsigned long old_cr4 __initdata; - -void __init stop_mce(void) -{ - old_cr4 = read_cr4(); - clear_in_cr4(X86_CR4_MCE); -} - -void __init restart_mce(void) -{ - if (old_cr4 & X86_CR4_MCE) - set_in_cr4(X86_CR4_MCE); -} - /* * Old style boot options parsing. Only for compatibility. */ -- cgit v1.2.3 From 9bd984058088d6ef7af6946591a207e51a2f4890 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:28 +0100 Subject: x86, mce: always use separate work queue to run trigger Impact: Needed for bug fix in next patch This relaxes the requirement that mce_notify_user has to run in process context. Useful for future changes, but also leads to cleaner behaviour now. Now instead mce_notify_user can be called directly from interrupt (but not NMI) context. The work queue only uses a single global work struct, which can be done safely because it is always free to reuse before the trigger function is executed. This way no events can be lost. Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck/mce_64.c') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 25ccdbec86e..18b379cf061 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -380,11 +380,17 @@ static void mcheck_timer(struct work_struct *work) schedule_delayed_work(&mcheck_work, next_interval); } +static void mce_do_trigger(struct work_struct *work) +{ + call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); +} + +static DECLARE_WORK(mce_trigger_work, mce_do_trigger); + /* - * This is only called from process context. This is where we do - * anything we need to alert userspace about new MCEs. This is called - * directly from the poller and also from entry.S and idle, thanks to - * TIF_MCE_NOTIFY. + * Notify the user(s) about new machine check events. + * Can be called from interrupt context, but not from machine check/NMI + * context. */ int mce_notify_user(void) { @@ -394,9 +400,14 @@ int mce_notify_user(void) unsigned long now = jiffies; wake_up_interruptible(&mce_wait); - if (trigger[0]) - call_usermodehelper(trigger, trigger_argv, NULL, - UMH_NO_WAIT); + + /* + * There is no risk of missing notifications because + * work_pending is always cleared before the function is + * executed. + */ + if (trigger[0] && !work_pending(&mce_trigger_work)) + schedule_work(&mce_trigger_work); if (time_after_eq(now, last_print + (check_interval*HZ))) { last_print = now; -- cgit v1.2.3 From 52d168e28bc11dd026b620fe1767cadde5a747cd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:29 +0100 Subject: x86, mce: switch machine check polling to per CPU timer Impact: Higher priority bug fix The machine check poller runs a single timer and then broadcasted an IPI to all CPUs to check them. This leads to unnecessary synchronization between CPUs. The original CPU running the timer has to wait potentially a long time for all other CPUs answering. This is also real time unfriendly and in general inefficient. This was especially a problem on systems with a lot of events where the poller run with a higher frequency after processing some events. There could be more and more CPU time wasted with this, to the point of significantly slowing down machines. The machine check polling is actually fully independent per CPU, so there's no reason to not just do this all with per CPU timers. This patch implements that. Also switch the poller also to use standard timers instead of work queues. It was using work queues to be able to execute a user program on a event, but mce_notify_user() handles this case now with a separate callback. So instead always run the poll code in in a standard per CPU timer, which means that in the common case of not having to execute a trigger there will be less overhead. This allows to clean up the initialization significantly, because standard timers are already up when machine checks get init'ed. No multiple initialization functions. Thanks to Thomas Gleixner for some help. Cc: thockin@google.com v2: Use del_timer_sync() on cpu shutdown and don't try to handle migrated timers. v3: Add WARN_ON for timer running on unexpected CPU Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 68 ++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck/mce_64.c') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 18b379cf061..3f0550d16f3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -353,18 +353,17 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status) static int check_interval = 5 * 60; /* 5 minutes */ static int next_interval; /* in jiffies */ -static void mcheck_timer(struct work_struct *work); -static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); +static void mcheck_timer(unsigned long); +static DEFINE_PER_CPU(struct timer_list, mce_timer); -static void mcheck_check_cpu(void *info) +static void mcheck_timer(unsigned long data) { + struct timer_list *t = &per_cpu(mce_timer, data); + + WARN_ON(smp_processor_id() != data); + if (mce_available(¤t_cpu_data)) do_machine_check(NULL, 0); -} - -static void mcheck_timer(struct work_struct *work) -{ - on_each_cpu(mcheck_check_cpu, NULL, 1); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -377,7 +376,8 @@ static void mcheck_timer(struct work_struct *work) (int)round_jiffies_relative(check_interval*HZ)); } - schedule_delayed_work(&mcheck_work, next_interval); + t->expires = jiffies + next_interval; + add_timer(t); } static void mce_do_trigger(struct work_struct *work) @@ -436,16 +436,11 @@ static struct notifier_block mce_idle_notifier = { static __init int periodic_mcheck_init(void) { - next_interval = check_interval * HZ; - if (next_interval) - schedule_delayed_work(&mcheck_work, - round_jiffies_relative(next_interval)); - idle_notifier_register(&mce_idle_notifier); - return 0; + idle_notifier_register(&mce_idle_notifier); + return 0; } __initcall(periodic_mcheck_init); - /* * Initialize Machine Checks for a CPU. */ @@ -515,6 +510,20 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) } } +static void mce_init_timer(void) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + + /* data race harmless because everyone sets to the same value */ + if (!next_interval) + next_interval = check_interval * HZ; + if (!next_interval) + return; + setup_timer(t, mcheck_timer, smp_processor_id()); + t->expires = round_jiffies_relative(jiffies + next_interval); + add_timer(t); +} + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off. @@ -529,6 +538,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) mce_init(NULL); mce_cpu_features(c); + mce_init_timer(); } /* @@ -758,17 +768,19 @@ static int mce_resume(struct sys_device *dev) return 0; } +static void mce_cpu_restart(void *data) +{ + del_timer_sync(&__get_cpu_var(mce_timer)); + if (mce_available(¤t_cpu_data)) + mce_init(NULL); + mce_init_timer(); +} + /* Reinit MCEs after user configuration changes */ static void mce_restart(void) { - if (next_interval) - cancel_delayed_work(&mcheck_work); - /* Timer race is harmless here */ - on_each_cpu(mce_init, NULL, 1); next_interval = check_interval * HZ; - if (next_interval) - schedule_delayed_work(&mcheck_work, - round_jiffies_relative(next_interval)); + on_each_cpu(mce_cpu_restart, NULL, 1); } static struct sysdev_class mce_sysclass = { @@ -899,6 +911,7 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; + struct timer_list *t = &per_cpu(mce_timer, cpu); switch (action) { case CPU_ONLINE: @@ -913,6 +926,15 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, threshold_cpu_callback(action, cpu); mce_remove_device(cpu); break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + del_timer_sync(t); + break; + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + t->expires = round_jiffies_relative(jiffies + next_interval); + add_timer_on(t, cpu); + break; } return NOTIFY_OK; } -- cgit v1.2.3 From 5b4408fdaa62474dd9485cddb9126370d90d4b82 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:30 +0100 Subject: x86, mce: don't set up mce sysdev devices with mce=off Impact: bug fix, in this case the resume handler shouldn't run which avoids incorrectly reenabling machine checks on resume When MCEs are completely disabled on the command line don't set up the sysdev devices for them either. Includes a comment fix from Thomas Gleixner. Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck/mce_64.c') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 3f0550d16f3..4e2b1bc5131 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -151,6 +151,8 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) static int mce_available(struct cpuinfo_x86 *c) { + if (mce_dont_init) + return 0; return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } @@ -532,8 +534,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { mce_cpu_quirks(c); - if (mce_dont_init || - !mce_available(c)) + if (!mce_available(c)) return; mce_init(NULL); @@ -710,8 +711,7 @@ static int __init mcheck_disable(char *str) return 1; } -/* mce=off disables machine check. Note you can re-enable it later - using sysfs. +/* mce=off disables machine check. mce=TOLERANCELEVEL (number, see above) mce=bootlog Log MCEs from before booting. Disabled by default on AMD. mce=nobootlog Don't log MCEs from before booting. */ -- cgit v1.2.3 From d6b75584a3eaab8cb2ab3e8cf90c5e57c1928a85 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:31 +0100 Subject: x86, mce: disable machine checks on offlined CPUs Impact: Lower priority bug fix Offlined CPUs could still get machine checks, but the machine check handler cannot handle them properly, leading to an unconditional crash. Disable machine checks on CPUs that are going down. Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'arch/x86/kernel/cpu/mcheck/mce_64.c') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 4e2b1bc5131..1db94c0d5aa 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -906,6 +906,27 @@ static __cpuinit void mce_remove_device(unsigned int cpu) cpu_clear(cpu, mce_device_initialized); } +/* Make sure there are no machine checks on offlined CPUs. */ +static void __cpuexit mce_disable_cpu(void *h) +{ + int i; + + if (!mce_available(¤t_cpu_data)) + return; + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); +} + +static void __cpuexit mce_reenable_cpu(void *h) +{ + int i; + + if (!mce_available(¤t_cpu_data)) + return; + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); +} + /* Get notified when a cpu comes on/off. Be hotplug friendly. */ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -929,11 +950,13 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: del_timer_sync(t); + smp_call_function_single(cpu, mce_disable_cpu, NULL, 1); break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: t->expires = round_jiffies_relative(jiffies + next_interval); add_timer_on(t, cpu); + smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1); break; } return NOTIFY_OK; -- cgit v1.2.3 From ef41df4344ff952c79746d44a6126bd2cf7ed2bc Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 12 Feb 2009 13:39:34 +0100 Subject: x86, mce: fix a race condition in mce_read() Impact: bugfix Considering the situation as follow: before: mcelog.next == 1, mcelog.entry[0].finished = 1 +-------------------------------------------------------------------------- R W1 W2 W3 read mcelog.next (1) mcelog.next++ (2) (working on entry 1, finished == 0) mcelog.next = 0 mcelog.next++ (1) (working on entry 0) mcelog.next++ (2) (working on entry 1) <----------------- race ----------------> (done on entry 1, finished = 1) (done on entry 1, finished = 1) To fix the race condition, a cmpxchg loop is added to mce_read() to ensure no new MCE record can be added between mcelog.next reading and mcelog.next = 0. Signed-off-by: Huang Ying Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 41 ++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck/mce_64.c') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 1db94c0d5aa..870d08deccf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -595,7 +595,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, { unsigned long *cpu_tsc; static DEFINE_MUTEX(mce_read_mutex); - unsigned next; + unsigned prev, next; char __user *buf = ubuf; int i, err; @@ -614,25 +614,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, } err = 0; - for (i = 0; i < next; i++) { - unsigned long start = jiffies; - - while (!mcelog.entry[i].finished) { - if (time_after_eq(jiffies, start + 2)) { - memset(mcelog.entry + i,0, sizeof(struct mce)); - goto timeout; + prev = 0; + do { + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + + while (!mcelog.entry[i].finished) { + if (time_after_eq(jiffies, start + 2)) { + memset(mcelog.entry + i, 0, + sizeof(struct mce)); + goto timeout; + } + cpu_relax(); } - cpu_relax(); + smp_rmb(); + err |= copy_to_user(buf, mcelog.entry + i, + sizeof(struct mce)); + buf += sizeof(struct mce); +timeout: + ; } - smp_rmb(); - err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); - buf += sizeof(struct mce); - timeout: - ; - } - memset(mcelog.entry, 0, next * sizeof(struct mce)); - mcelog.next = 0; + memset(mcelog.entry + prev, 0, + (next - prev) * sizeof(struct mce)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); synchronize_sched(); -- cgit v1.2.3 From 0d7482e3d76522157c9d741d79fce22c401fa0c5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 17 Feb 2009 23:07:13 +0100 Subject: x86, mce: implement dynamic machine check banks support Impact: cleanup; making code future proof; memory saving on small systems This patch replaces the hardcoded max number of machine check banks with dynamic allocation depending on what the CPU reports. The sysfs data structures and the banks array are dynamically allocated. There is still a hard bank limit (128) because the mcelog protocol uses banks >= 128 as pseudo banks to escape other events. But we expect that 128 banks is beyond any reasonable CPU for now. This supersedes an earlier patch by Venki, but it solves the problem more completely by making the limit fully dynamic (up to the 128 boundary). This saves some memory on machines with less than 6 banks because they won't need sysdevs for unused ones and also allows to use sysfs to control these banks on possible future CPUs with more than 6 banks. This is an updated patch addressing Venki's comments. I also added in another patch from Thomas which fixed the error allocation path (that patch was previously separated) Cc: Venki Pallipadi Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 147 ++++++++++++++++++++++++++++-------- 1 file changed, 115 insertions(+), 32 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck/mce_64.c') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 870d08deccf..2297730bb51 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include #include #include @@ -32,7 +34,12 @@ #include #define MISC_MCELOG_MINOR 227 -#define NR_SYSFS_BANKS 6 + +/* + * To support more than 128 would need to escape the predefined + * Linux defined extended banks first. + */ +#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) atomic_t mce_entry; @@ -47,7 +54,7 @@ static int mce_dont_init; */ static int tolerant = 1; static int banks; -static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; +static u64 *bank; static unsigned long notify_user; static int rip_msr; static int mce_bootlog = -1; @@ -212,7 +219,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) barrier(); for (i = 0; i < banks; i++) { - if (i < NR_SYSFS_BANKS && !bank[i]) + if (!bank[i]) continue; m.misc = 0; @@ -446,37 +453,54 @@ __initcall(periodic_mcheck_init); /* * Initialize Machine Checks for a CPU. */ -static void mce_init(void *dummy) +static int mce_cap_init(void) { u64 cap; - int i; + unsigned b; rdmsrl(MSR_IA32_MCG_CAP, cap); - banks = cap & 0xff; - if (banks > MCE_EXTENDED_BANK) { - banks = MCE_EXTENDED_BANK; - printk(KERN_INFO "MCE: warning: using only %d banks\n", - MCE_EXTENDED_BANK); + b = cap & 0xff; + if (b > MAX_NR_BANKS) { + printk(KERN_WARNING + "MCE: Using only %u machine check banks out of %u\n", + MAX_NR_BANKS, b); + b = MAX_NR_BANKS; + } + + /* Don't support asymmetric configurations today */ + WARN_ON(banks != 0 && b != banks); + banks = b; + if (!bank) { + bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); + if (!bank) + return -ENOMEM; + memset(bank, 0xff, banks * sizeof(u64)); } + /* Use accurate RIP reporting if available. */ if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) rip_msr = MSR_IA32_MCG_EIP; + return 0; +} + +static void mce_init(void *dummy) +{ + u64 cap; + int i; + /* Log the machine checks left over from the previous reset. This also clears all registers */ do_machine_check(NULL, mce_bootlog ? -1 : -2); set_in_cr4(X86_CR4_MCE); + rdmsrl(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - if (i < NR_SYSFS_BANKS) - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); - else - wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); - + wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } } @@ -486,10 +510,10 @@ static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { - if(c->x86 == 15) + if (c->x86 == 15 && banks > 4) /* disable GART TBL walk error reporting, which trips off incorrectly with the IOMMU & 3ware & Cerberus. */ - clear_bit(10, &bank[4]); + clear_bit(10, (unsigned long *)&bank[4]); if(c->x86 <= 17 && mce_bootlog < 0) /* Lots of broken BIOS around that don't clear them by default and leave crap in there. Don't log. */ @@ -532,11 +556,15 @@ static void mce_init_timer(void) */ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { - mce_cpu_quirks(c); - if (!mce_available(c)) return; + if (mce_cap_init() < 0) { + mce_dont_init = 1; + return; + } + mce_cpu_quirks(c); + mce_init(NULL); mce_cpu_features(c); mce_init_timer(); @@ -819,16 +847,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit } \ static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); -/* - * TBD should generate these dynamically based on number of available banks. - * Have only 6 contol banks in /sysfs until then. - */ -ACCESSOR(bank0ctl,bank[0],mce_restart()) -ACCESSOR(bank1ctl,bank[1],mce_restart()) -ACCESSOR(bank2ctl,bank[2],mce_restart()) -ACCESSOR(bank3ctl,bank[3],mce_restart()) -ACCESSOR(bank4ctl,bank[4],mce_restart()) -ACCESSOR(bank5ctl,bank[5],mce_restart()) +static struct sysdev_attribute *bank_attrs; + +static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, + char *buf) +{ + u64 b = bank[attr - bank_attrs]; + return sprintf(buf, "%Lx\n", b); +} + +static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, + const char *buf, size_t siz) +{ + char *end; + u64 new = simple_strtoull(buf, &end, 0); + if (end == buf) + return -EINVAL; + bank[attr - bank_attrs] = new; + mce_restart(); + return end-buf; +} static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) @@ -855,8 +893,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); ACCESSOR(check_interval,check_interval,mce_restart()) static struct sysdev_attribute *mce_attributes[] = { - &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, - &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, &attr_tolerant.attr, &attr_check_interval, &attr_trigger, NULL }; @@ -886,11 +922,22 @@ static __cpuinit int mce_create_device(unsigned int cpu) if (err) goto error; } + for (i = 0; i < banks; i++) { + err = sysdev_create_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); + if (err) + goto error2; + } cpu_set(cpu, mce_device_initialized); return 0; +error2: + while (--i >= 0) { + sysdev_remove_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); + } error: - while (i--) { + while (--i >= 0) { sysdev_remove_file(&per_cpu(device_mce,cpu), mce_attributes[i]); } @@ -909,6 +956,9 @@ static __cpuinit void mce_remove_device(unsigned int cpu) for (i = 0; mce_attributes[i]; i++) sysdev_remove_file(&per_cpu(device_mce,cpu), mce_attributes[i]); + for (i = 0; i < banks; i++) + sysdev_remove_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); sysdev_unregister(&per_cpu(device_mce,cpu)); cpu_clear(cpu, mce_device_initialized); } @@ -973,6 +1023,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = { .notifier_call = mce_cpu_callback, }; +static __init int mce_init_banks(void) +{ + int i; + + bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, + GFP_KERNEL); + if (!bank_attrs) + return -ENOMEM; + + for (i = 0; i < banks; i++) { + struct sysdev_attribute *a = &bank_attrs[i]; + a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); + if (!a->attr.name) + goto nomem; + a->attr.mode = 0644; + a->show = show_bank; + a->store = set_bank; + } + return 0; + +nomem: + while (--i >= 0) + kfree(bank_attrs[i].attr.name); + kfree(bank_attrs); + bank_attrs = NULL; + return -ENOMEM; +} + static __init int mce_init_device(void) { int err; @@ -980,6 +1058,11 @@ static __init int mce_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; + + err = mce_init_banks(); + if (err) + return err; + err = sysdev_class_register(&mce_sysclass); if (err) return err; -- cgit v1.2.3 From b5f2fa4ea00a179ac1c2ff342ceeee261dd75e53 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:43:22 +0100 Subject: x86, mce: factor out duplicated struct mce setup into one function Impact: cleanup This merely factors out duplicated code to set up the initial struct mce state into a single function. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck/mce_64.c') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 2297730bb51..fed875742b1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -65,6 +65,14 @@ static char *trigger_argv[2] = { trigger, NULL }; static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +/* Do initial initialization of a struct mce */ +void mce_setup(struct mce *m) +{ + memset(m, 0, sizeof(struct mce)); + m->cpu = smp_processor_id(); + rdtscll(m->tsc); +} + /* * Lockless MCE logging infrastructure. * This avoids deadlocks on printk locks without having to break locks. Also @@ -208,8 +216,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) || !banks) goto out2; - memset(&m, 0, sizeof(struct mce)); - m.cpu = smp_processor_id(); + mce_setup(&m); + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); /* if the restart IP is not valid, we're done for */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) @@ -225,7 +233,6 @@ void do_machine_check(struct pt_regs * regs, long error_code) m.misc = 0; m.addr = 0; m.bank = i; - m.tsc = 0; rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); if ((m.status & MCI_STATUS_VAL) == 0) @@ -252,8 +259,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); mce_get_rip(&m, regs); - if (error_code >= 0) - rdtscll(m.tsc); + if (error_code < 0) + m.tsc = 0; if (error_code != -2) mce_log(&m); @@ -341,15 +348,13 @@ void do_machine_check(struct pt_regs * regs, long error_code) * and historically has been the register value of the * MSR_IA32_THERMAL_STATUS (Intel) msr. */ -void mce_log_therm_throt_event(unsigned int cpu, __u64 status) +void mce_log_therm_throt_event(__u64 status) { struct mce m; - memset(&m, 0, sizeof(m)); - m.cpu = cpu; + mce_setup(&m); m.bank = MCE_THERMAL_BANK; m.status = status; - rdtscll(m.tsc); mce_log(&m); } #endif /* CONFIG_X86_MCE_INTEL */ -- cgit v1.2.3 From b79109c3bbcf52cac5103979b283b9e5df4e796c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:43:23 +0100 Subject: x86, mce: separate correct machine check poller and fatal exception handler Impact: cleanup, performance enhancement The machine check poller is diverging more and more from the fatal exception handler. Instead of adding more special cases separate the code paths completely. The corrected poll path is actually quite simple, and this doesn't result in much code duplication. This makes both handlers much easier to read and results in cleaner code flow. The exception handler now only needs to care about uncorrected errors, which also simplifies the handling of multiple errors. The corrected poller also now always runs in standard interrupt context and does not need to do anything special to handle NMI context. Minor behaviour changes: - MCG status is now not cleared on polling. - Only the banks which had corrected errors get cleared on polling - The exception handler only clears banks with errors now v2: Forward port to new patch order. Add "uc" argument. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 129 ++++++++++++++++++++++++++++++------ 1 file changed, 108 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck/mce_64.c') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index fed875742b1..268b05edade 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -3,6 +3,8 @@ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. * Rest from unknown author(s). * 2004 Andi Kleen. Rewrote most of it. + * Copyright 2008 Intel Corporation + * Author: Andi Kleen */ #include @@ -189,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } /* - * The actual machine check handler + * Poll for corrected events or events that happened before reset. + * Those are just logged through /dev/mcelog. + * + * This is executed in standard interrupt context. + */ +void machine_check_poll(enum mcp_flags flags) +{ + struct mce m; + int i; + + mce_setup(&m); + + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + for (i = 0; i < banks; i++) { + if (!bank[i]) + continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; + m.tsc = 0; + + barrier(); + rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + if (!(m.status & MCI_STATUS_VAL)) + continue; + + /* + * Uncorrected events are handled by the exception handler + * when it is enabled. But when the exception is disabled log + * everything. + * + * TBD do the same check for MCI_STATUS_EN here? + */ + if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) + continue; + + if (m.status & MCI_STATUS_MISCV) + rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + + if (!(flags & MCP_TIMESTAMP)) + m.tsc = 0; + /* + * Don't get the IP here because it's unlikely to + * have anything to do with the actual error location. + */ + + mce_log(&m); + add_taint(TAINT_MACHINE_CHECK); + + /* + * Clear state for this bank. + */ + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } + + /* + * Don't clear MCG_STATUS here because it's only defined for + * exceptions. + */ +} + +/* + * The actual machine check handler. This only handles real + * exceptions when something got corrupted coming in through int 18. + * + * This is executed in NMI context not subject to normal locking rules. This + * implies that most kernel services cannot be safely used. Don't even + * think about putting a printk in there! */ void do_machine_check(struct pt_regs * regs, long error_code) { @@ -207,13 +279,14 @@ void do_machine_check(struct pt_regs * regs, long error_code) * error. */ int kill_it = 0; + DECLARE_BITMAP(toclear, MAX_NR_BANKS); atomic_inc(&mce_entry); - if ((regs - && notify_die(DIE_NMI, "machine check", regs, error_code, + if (notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL) == NOTIFY_STOP) - || !banks) + goto out2; + if (!banks) goto out2; mce_setup(&m); @@ -227,6 +300,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) barrier(); for (i = 0; i < banks; i++) { + __clear_bit(i, toclear); if (!bank[i]) continue; @@ -238,6 +312,20 @@ void do_machine_check(struct pt_regs * regs, long error_code) if ((m.status & MCI_STATUS_VAL) == 0) continue; + /* + * Non uncorrected errors are handled by machine_check_poll + * Leave them alone. + */ + if ((m.status & MCI_STATUS_UC) == 0) + continue; + + /* + * Set taint even when machine check was not enabled. + */ + add_taint(TAINT_MACHINE_CHECK); + + __set_bit(i, toclear); + if (m.status & MCI_STATUS_EN) { /* if PCC was set, there's no way out */ no_way_out |= !!(m.status & MCI_STATUS_PCC); @@ -251,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code) no_way_out = 1; kill_it = 1; } + } else { + /* + * Machine check event was not enabled. Clear, but + * ignore. + */ + continue; } if (m.status & MCI_STATUS_MISCV) @@ -259,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); mce_get_rip(&m, regs); - if (error_code < 0) - m.tsc = 0; - if (error_code != -2) - mce_log(&m); + mce_log(&m); /* Did this bank cause the exception? */ /* Assume that the bank with uncorrectable errors did it, @@ -271,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) panicm = m; panicm_found = 1; } - - add_taint(TAINT_MACHINE_CHECK); } - /* Never do anything final in the polling timer */ - if (!regs) - goto out; - /* If we didn't find an uncorrectable error, pick the last one (shouldn't happen, just being safe). */ if (!panicm_found) @@ -325,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code) /* notify userspace ASAP */ set_thread_flag(TIF_MCE_NOTIFY); - out: /* the last thing we do is clear state */ - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + for (i = 0; i < banks; i++) { + if (test_bit(i, toclear)) + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } wrmsrl(MSR_IA32_MCG_STATUS, 0); out2: atomic_dec(&mce_entry); @@ -377,7 +463,7 @@ static void mcheck_timer(unsigned long data) WARN_ON(smp_processor_id() != data); if (mce_available(¤t_cpu_data)) - do_machine_check(NULL, 0); + machine_check_poll(MCP_TIMESTAMP); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -494,9 +580,10 @@ static void mce_init(void *dummy) u64 cap; int i; - /* Log the machine checks left over from the previous reset. - This also clears all registers */ - do_machine_check(NULL, mce_bootlog ? -1 : -2); + /* + * Log the machine checks left over from the previous reset. + */ + machine_check_poll(MCP_UC); set_in_cr4(X86_CR4_MCE); -- cgit v1.2.3 From f6d1826dfad0d15fd14a455facc80b91f2ee642f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 19 Feb 2009 15:44:58 -0800 Subject: x86, mce: use %ll instead of %L for 64-bit numbers Impact: Cleanup The standard spelling of a printf pattern for long long is "ll", not "L", which is for long double. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck/mce_64.c') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 268b05edade..60a114ca14f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -136,11 +136,11 @@ static void print_mce(struct mce *m) print_symbol("{%s}", m->ip); printk("\n"); } - printk(KERN_EMERG "TSC %Lx ", m->tsc); + printk(KERN_EMERG "TSC %llx ", m->tsc); if (m->addr) - printk("ADDR %Lx ", m->addr); + printk("ADDR %llx ", m->addr); if (m->misc) - printk("MISC %Lx ", m->misc); + printk("MISC %llx ", m->misc); printk("\n"); printk(KERN_EMERG "This is not a software problem!\n"); printk(KERN_EMERG "Run through mcelog --ascii to decode " @@ -945,7 +945,7 @@ static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { u64 b = bank[attr - bank_attrs]; - return sprintf(buf, "%Lx\n", b); + return sprintf(buf, "%llx\n", b); } static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, -- cgit v1.2.3 From ec5b3d32437571b8a742069a4cfd04edb6b6eda5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 23 Feb 2009 14:01:04 -0800 Subject: x86, mce: remove invalid __cpuinit/__cpuexit annotations Impact: Bug fix when CPU hotplug is disabled Correct the following broken __cpuinit/__cpuexit annotations: - mce_cpu_features() is called from mce_resume(), and so cannot be __cpuinit. - mce_disable_cpu() and mce_reenable_cpu() are called from mce_cpu_callback(), and so cannot be __cpuexit(). Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck/mce_64.c') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 60a114ca14f..0625993bf95 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -598,7 +598,7 @@ static void mce_init(void *dummy) } /* Add per CPU specific workarounds here */ -static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) +static void mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { @@ -1056,7 +1056,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu) } /* Make sure there are no machine checks on offlined CPUs. */ -static void __cpuexit mce_disable_cpu(void *h) +static void mce_disable_cpu(void *h) { int i; @@ -1066,7 +1066,7 @@ static void __cpuexit mce_disable_cpu(void *h) wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); } -static void __cpuexit mce_reenable_cpu(void *h) +static void mce_reenable_cpu(void *h) { int i; -- cgit v1.2.3 From 41fdff322e26c4a86fe65cf577f2556a650cb7bc Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:30 +0100 Subject: x86, mce, cmci: export MAX_NR_BANKS Impact: Cleanup (code movement) Move MAX_NR_BANKS into mce.h because it's needed there for followup patches. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck/mce_64.c') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index a4a7c686ce9..39f8bb525a7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -37,12 +37,6 @@ #define MISC_MCELOG_MINOR 227 -/* - * To support more than 128 would need to escape the predefined - * Linux defined extended banks first. - */ -#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) - atomic_t mce_entry; static int mce_dont_init; -- cgit v1.2.3 From 8457c84d68678cbfd4167a9073b89da58e48c037 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:33 +0100 Subject: x86, mce: replace machine check events logged interval with ratelimit Impact: behavior change, use common code Use a standard leaky bucket ratelimit for the machine check warning print interval instead of waiting every check_interval. Also decrease the limit to twice per minute. This interacts better with threshold interrupts because they can happen more often than check_interval. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck/mce_64.c') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 39f8bb525a7..9017609cadd 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -488,11 +489,11 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger); */ int mce_notify_user(void) { + /* Not more than two messages every minute */ + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + clear_thread_flag(TIF_MCE_NOTIFY); if (test_and_clear_bit(0, ¬ify_user)) { - static unsigned long last_print; - unsigned long now = jiffies; - wake_up_interruptible(&mce_wait); /* @@ -503,10 +504,8 @@ int mce_notify_user(void) if (trigger[0] && !work_pending(&mce_trigger_work)) schedule_work(&mce_trigger_work); - if (time_after_eq(now, last_print + (check_interval*HZ))) { - last_print = now; + if (__ratelimit(&ratelimit)) printk(KERN_INFO "Machine check events logged\n"); - } return 1; } -- cgit v1.2.3 From ee031c31d6381d004bfd386c2e45821211507499 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:34 +0100 Subject: x86, mce, cmci: use polled banks bitmap in machine check poller Define a per cpu bitmap that contains the banks polled by the machine check poller. This is needed for the CMCI code in the next patches to be able to disable polling on specific banks. The bank by default contains all banks, so there is no behaviour change. Only future code will remove some banks from the polling set. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck/mce_64.c') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 9017609cadd..a8ff38bfa6e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -62,6 +62,11 @@ static char *trigger_argv[2] = { trigger, NULL }; static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +/* MCA banks polled by the period polling timer for corrected events */ +DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { + [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL +}; + /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { @@ -191,7 +196,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) * * This is executed in standard interrupt context. */ -void machine_check_poll(enum mcp_flags flags) +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { struct mce m; int i; @@ -200,7 +205,7 @@ void machine_check_poll(enum mcp_flags flags) rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); for (i = 0; i < banks; i++) { - if (!bank[i]) + if (!bank[i] || !test_bit(i, *b)) continue; m.misc = 0; @@ -458,7 +463,8 @@ static void mcheck_timer(unsigned long data) WARN_ON(smp_processor_id() != data); if (mce_available(¤t_cpu_data)) - machine_check_poll(MCP_TIMESTAMP); + machine_check_poll(MCP_TIMESTAMP, + &__get_cpu_var(mce_poll_banks)); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -572,11 +578,13 @@ static void mce_init(void *dummy) { u64 cap; int i; + mce_banks_t all_banks; /* * Log the machine checks left over from the previous reset. */ - machine_check_poll(MCP_UC); + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC, &all_banks); set_in_cr4(X86_CR4_MCE); -- cgit v1.2.3 From 88ccbedd9ca85d1aca6a6f99df48dce87b7c02d4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:36 +0100 Subject: x86, mce, cmci: add CMCI support Impact: Major new feature Intel CMCI (Corrected Machine Check Interrupt) is a new feature on Nehalem CPUs. It allows the CPU to trigger interrupts on corrected events, which allows faster reaction to them instead of with the traditional polling timer. Also use CMCI to discover shared banks. Machine check banks can be shared by CPU threads or even cores. Using the CMCI enable bit it is possible to detect the fact that another CPU already saw a specific bank. Use this to assign shared banks only to one CPU to avoid reporting duplicated events. On CPU hot unplug bank sharing is re discovered. This is done using a thread that cycles through all the CPUs. To avoid races between the poller and CMCI we only poll for banks that are not CMCI capable and only check CMCI owned banks on a interrupt. The shared banks ownership information is currently only used for CMCI interrupts, not polled banks. The sharing discovery code follows the algorithm recommended in the IA32 SDM Vol3a 14.5.2.1 The CMCI interrupt handler just calls the machine check poller to pick up the machine check event that caused the interrupt. I decided not to implement a separate threshold event like the AMD version has, because the threshold is always one currently and adding another event didn't seem to add any value. Some code inspired by Yunhong Jiang's Xen implementation, which was in term inspired by a earlier CMCI implementation by me. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck/mce_64.c') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index a8ff38bfa6e..bfbd5323a63 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -166,7 +166,7 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) panic(msg); } -static int mce_available(struct cpuinfo_x86 *c) +int mce_available(struct cpuinfo_x86 *c) { if (mce_dont_init) return 0; @@ -1060,9 +1060,12 @@ static __cpuinit void mce_remove_device(unsigned int cpu) static void mce_disable_cpu(void *h) { int i; + unsigned long action = *(unsigned long *)h; if (!mce_available(¤t_cpu_data)) return; + if (!(action & CPU_TASKS_FROZEN)) + cmci_clear(); for (i = 0; i < banks; i++) wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); } @@ -1070,9 +1073,12 @@ static void mce_disable_cpu(void *h) static void mce_reenable_cpu(void *h) { int i; + unsigned long action = *(unsigned long *)h; if (!mce_available(¤t_cpu_data)) return; + if (!(action & CPU_TASKS_FROZEN)) + cmci_reenable(); for (i = 0; i < banks; i++) wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); } @@ -1100,13 +1106,17 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: del_timer_sync(t); - smp_call_function_single(cpu, mce_disable_cpu, NULL, 1); + smp_call_function_single(cpu, mce_disable_cpu, &action, 1); break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: t->expires = round_jiffies_relative(jiffies + next_interval); add_timer_on(t, cpu); - smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1); + smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); + break; + case CPU_POST_DEAD: + /* intentionally ignoring frozen here */ + cmci_rediscover(cpu); break; } return NOTIFY_OK; -- cgit v1.2.3 From 5490fa96735ce0e2af270c0868987d644b9a38ec Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 11 Mar 2009 10:14:26 +0900 Subject: x86, mce: use round_jiffies() instead round_jiffies_relative() Impact: saving power _very_ little round_jiffies() round up absolute jiffies to full second. round_jiffies_relative() round up relative jiffies to full second. The "t->expires" is absolute jiffies. Then, round_jiffies() should be used instead round_jiffies_relative(). Signed-off-by: KOSAKI Motohiro Cc: Andi Kleen Cc: H. Peter Anvin Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu/mcheck/mce_64.c') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index bfbd5323a63..ca14604611e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -639,7 +639,7 @@ static void mce_init_timer(void) if (!next_interval) return; setup_timer(t, mcheck_timer, smp_processor_id()); - t->expires = round_jiffies_relative(jiffies + next_interval); + t->expires = round_jiffies(jiffies + next_interval); add_timer(t); } @@ -1110,7 +1110,7 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - t->expires = round_jiffies_relative(jiffies + next_interval); + t->expires = round_jiffies(jiffies + next_interval); add_timer_on(t, cpu); smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); break; -- cgit v1.2.3